In [50]:
import os
import pandas as pd
from tqdm import tqdm
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import re
from collections import Counter

In [39]:
import json
from requests import get

def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids


try:
    organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
except json.decoder.JSONDecodeError:
    organization_subclass = []

try:
    country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
except json.decoder.JSONDecodeError:
    country_subclass = []

try:
    city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
except json.decoder.JSONDecodeError:
    city_subclass = []

try:
    capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
except json.decoder.JSONDecodeError:
    capitals_subclass = []

try:
    admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
except json.decoder.JSONDecodeError:
    admTerr_subclass = []

try:
    family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
except json.decoder.JSONDecodeError:
    family_subclass = []

try:
    sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
except json.decoder.JSONDecodeError:
    sportLeague_subclass = []

try:
    venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
except json.decoder.JSONDecodeError:
    venue_subclass = []

# Removing overlaps for organization_subclass
organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))

try:
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
except json.decoder.JSONDecodeError:
    geolocation_subclass = []

try:
    food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
except json.decoder.JSONDecodeError:
    food_subclass = []

try:
    edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
except json.decoder.JSONDecodeError:
    edInst_subclass = []

try:
    govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
except json.decoder.JSONDecodeError:
    govAgency_subclass = []

try:
    intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
except json.decoder.JSONDecodeError:
    intOrg_subclass = []

try:
    timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
except json.decoder.JSONDecodeError:
    timeZone_subclass = []

# Removing overlaps for geolocation_subclass
geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))



In [117]:
def split_entity_name(entity_name):
    """
    Splits the entity name into separate words if it has more than 2 uppercase letters.
    
    Args:
        entity_name (str): The input string to check and potentially split (e.g., 'VideoGame').
    
    Returns:
        str: The modified string with words split by spaces if necessary.
    """
    # Count uppercase letters
    uppercase_count = sum(1 for char in entity_name if char.isupper())
    
    # If more than 2 uppercase letters, split before every uppercase letter
    if uppercase_count >= 2:
        return re.sub(r'(?<!^)(?=[A-Z])', ' ', entity_name)
    
    return entity_name

def get_type_id(entity_name):
    """
    Queries the API to retrieve the QID of an entity based on its name.

    Args:
        entity_name (str): The name of the entity (e.g., 'Video Game').

    Returns:
        str: The QID associated with the given entity name, or None if an error occurs or no QID is found.
    """
    
    entity_name_split = split_entity_name(entity_name)

    url = f"https://lamapi.hel.sintef.cloud/lookup/entity-retrieval"
    params = {
        "name": entity_name_split,
        "kind": "type",
        "token": "lamapi_demo_2023"
    }
    headers = {
        "accept": "application/json"
    }

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
        # Assuming the API returns the QID directly or within a key, adjust if structure differs
        #return max(data, key=lambda x: x.get('pos_score', 0))
        return data[0]['id']
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


In [None]:
def retrieve_superclasses(entity_id):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784").

    Returns:
        dict: A dictionary where keys are superclass IDs, and values are their labels.
    """
    # Define the SPARQL endpoint and query
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?superclass ?superclassLabel WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=3, delay=5):
        for attempt in range(retries):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                results = sparql_client.query().convert()
                return results
            except Exception as e:
                if "429" in str(e):  # Handle Too Many Requests error
                    print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(delay)
                else:
                    print(f"An error occurred: {e}")
                    break
        return None

    # Set up the SPARQL client
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    # Process results and return as a dictionary
    if results:
        superclass_dict = {}
        for result in results["results"]["bindings"]:
            superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
            label = result["superclassLabel"]["value"]
            superclass_dict[superclass_id] = label
        return superclass_dict
    else:
        print("Failed to retrieve data after multiple attempts.")
        return {}


In [131]:
def WD_types(entity_id):
    """
    Queries the API to retrieve the list of type IDs for a given entity ID.

    Args:
        entity_id (str): The Wikidata ID of the entity (e.g., 'Q30').

    Returns:
        list: A list of type IDs associated with the given entity, or an empty list if an error occurs.
    """

    url = "https://lamapi.hel.sintef.cloud/entity/types?token=lamapi_demo_2023"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    payload = {"json": [entity_id]}
    
    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()  # Raise an error for HTTP issues
        ids = response.json()
        return ids[entity_id]['types']['P31']  # Parse and return the JSON response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    except KeyError as e:
        # Handle missing keys in the JSON response
        return []

In [167]:
# hai un solo NER type per entità perchè lato query type hai associato un solo 
# explicit_WD_type da mappare


def NER_types(entity_id):

    """
    Given an entity it retrieves the list of the associated NER types making
    the NER type extension of all the WD_types associated to it.
    """
    if not entity_id:
        return None  

    match = re.search(r'Q(\d+)', entity_id)
    numeric_id = int(match.group(1))

    NERtype = []
    # Classify NER types
    if numeric_id == 5:
        return 'PERS'
    elif numeric_id in geolocation_subclass:
        return 'LOC'
    elif numeric_id in organization_subclass:
        return 'ORG'
    else:
        return 'OTHERS'
    

In [171]:
def extended_WD_types(entity_id):

    """
    Given an entity it retrieves the list of the associated extended WD types making
    the type extension of all the WD_types associated to it.
    """
        
    if not entity_id:
        return None

    return list(retrieve_superclasses(entity_id).keys())
    

## Dataset Reading

In [188]:

tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round4_2020/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round4_2020/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round4_2020/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.keys()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {key: [cea_values_dict[key], cta_values_dict[key]] for key in common_keys}


In [189]:
cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    qid = cea_values_dict_cell[key].split('/')[-1]  # Extract the QID from the URL
                    qid_to_value[qid] = cell_value
                    break  # Exit inner loop early as only one match per row/col is needed

        return qid_to_value
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)

Processing tables: 100%|██████████| 22207/22207 [04:55<00:00, 75.23it/s]


In [191]:
WD_query_type = {
    value[0].split('/')[-1]: value[1].split('/')[-1]
    for value in combined_dict.values()
}

WD_query_type

{'Q78752723': 'Q1332364',
 'Q1393651': 'Q11424',
 'Q2269559': 'Q623109',
 'Q1508': 'Q29946056',
 'Q930314': 'Q32880',
 'Q6723': 'Q17272482',
 'Q110462': 'Q5',
 'Q5963660': 'Q1366112',
 'Q5963': 'Q85332736',
 'Q76597335': 'Q83373',
 'Q468161': 'Q1344',
 'Q59341592': 'Q1980247',
 'Q1216506': 'Q11424',
 'Q13644': 'Q11276',
 'Q488892': 'Q13410400',
 'Q313404': 'Q1331380',
 'Q75264907': 'Q1153392',
 'Q61919529': 'Q1092832',
 'Q73789987': 'Q19860854',
 'Q652166': 'Q5',
 'Q78241263': 'Q1352333',
 'Q209373': 'Q21009057',
 'Q63891772': 'Q22936940',
 'Q738601': 'Q11387',
 'Q64863410': 'Q101965',
 'Q63869675': 'Q22936940',
 'Q2659397': 'Q27968055',
 'Q1536088': 'Q2633744',
 'Q62102192': 'Q932825',
 'Q80602795': 'Q217012',
 'Q577598': 'Q5',
 'Q14472988': 'Q1968122',
 'Q4132669': 'Q21505397',
 'Q3656721': 'Q15298259',
 'Q85319072': 'Q72053617',
 'Q749234': 'Q67206691',
 'Q508153': 'Q127448',
 'Q183': 'Q6256',
 'Q7893054': 'Q858439',
 'Q19652': 'Q50424085',
 'Q79087327': 'Q7725310',
 'Q3224202': 'Q4

In [194]:
with open('R4_WD_query_type.json', 'w') as json_file:
    json.dump(WD_query_type, json_file, indent=4)


In [82]:
#print(f"{type_str}: {get_type_id(type_str)}")

entity_name = key_to_cell[list(WD_query_type.keys())[0]]
ext_query_types = list(set(retrieve_superclasses("Q31")))

# query a lamapi dove specifico nel filtro il tipo

#WD_candidate_types = WD_types(entity_id)  # WD_types() interroga il servizio types() ma forse è sbagliato (da implementare lato server non client)
print(ext_query_types)

['Q31']


In [236]:
# Initialize an empty dictionary for NER_query_type
NER_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for entity_name, entity_type in tqdm(WD_query_type.items(), desc="Processing Entities", unit="entity"):
    if entity_type is None:
        print("none")
        NER_query_type[entity_name] = None
    else:
        NER_query_type[entity_name] = NER_types(entity_type)

with open('HTR3_NER_query_type.json', 'w') as json_file:
    json.dump(NER_query_type, json_file, indent=4)


# Display the resulting dictionary
NER_query_type

Processing Entities:   0%|          | 0/47171 [00:00<?, ?entity/s]

Processing Entities: 100%|██████████| 47171/47171 [00:27<00:00, 1732.20entity/s]


{'Q64618013': 'OTHERS',
 'Q64617600': 'OTHERS',
 'Q56682821': 'OTHERS',
 'Q56612179': 'OTHERS',
 'Q43221084': 'OTHERS',
 'Q28890011': 'OTHERS',
 'Q19818313': 'OTHERS',
 'Q16941209': 'OTHERS',
 'Q16941198': 'OTHERS',
 'Q16941177': 'OTHERS',
 'Q16941174': 'OTHERS',
 'Q16941171': 'OTHERS',
 'Q16941169': 'OTHERS',
 'Q16941162': 'OTHERS',
 'Q16941157': 'OTHERS',
 'Q16941154': 'OTHERS',
 'Q16941150': 'OTHERS',
 'Q5856992': 'OTHERS',
 'Q62422071': 'OTHERS',
 'Q60999791': 'OTHERS',
 'Q105554399': 'OTHERS',
 'Q100250170': 'OTHERS',
 'Q99701350': 'OTHERS',
 'Q28096756': 'OTHERS',
 'Q21198118': 'OTHERS',
 'Q20546376': 'OTHERS',
 'Q43339738': 'OTHERS',
 'Q11390403': 'OTHERS',
 'Q10920066': 'OTHERS',
 'Q7074635': 'OTHERS',
 'Q5366046': 'OTHERS',
 'Q5362166': 'OTHERS',
 'Q5360937': 'OTHERS',
 'Q4145038': 'OTHERS',
 'Q15905509': 'OTHERS',
 'Q11886645': 'OTHERS',
 'Q1376060': 'OTHERS',
 'Q1213085': 'OTHERS',
 'Q1164830': 'OTHERS',
 'Q3761739': 'OTHERS',
 'Q3289889': 'OTHERS',
 'Q1042011': 'OTHERS',
 '

In [239]:
extended_WD_types("Q23397")

['Q23397',
 'Q271669',
 'Q337567',
 'Q2479431',
 'Q3391202',
 'Q618123',
 'Q2221906',
 'Q12766313',
 'Q15324',
 'Q863944',
 'Q1389310',
 'Q52105',
 'Q82794',
 'Q20719696',
 'Q27096213',
 'Q2507626',
 'Q4835091',
 'Q35145263',
 'Q43619',
 'Q26713767',
 'Q15989253',
 'Q223557',
 'Q123349660',
 'Q16686022',
 'Q27096220',
 'Q115385384',
 'Q2920644',
 'Q4406616',
 'Q53617407',
 'Q35120',
 'Q1970309',
 'Q23956024',
 'Q25404640',
 'Q124711484',
 'Q488383',
 'Q58778',
 'Q58415929',
 'Q53617489',
 'Q124711467',
 'Q99527517',
 'Q103940464']

In [237]:
# Initialize an empty dictionary for ext_WD_query_type
ext_WD_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for entity_name, entity_type in tqdm(WD_query_type.items(), desc="Processing Superclasses", unit="entity"):
    #print(f"{el}: #superclasses {len(extended_WD_types(el))}")
    ext_WD_query_type[entity_name] = extended_WD_types(entity_type)

with open('HTR3_ext_WD_query_type.json', 'w') as json_file:
    json.dump(ext_WD_query_type, json_file, indent=4)


# Display the resulting dictionary
ext_WD_query_type

Processing Superclasses:   0%|          | 35/47171 [00:09<3:07:55,  4.18entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   0%|          | 68/47171 [00:26<5:59:43,  2.18entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   7%|▋         | 3386/47171 [09:36<3:00:17,  4.05entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   8%|▊         | 3984/47171 [11:17<1:11:21, 10.09entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   8%|▊         | 3997/47171 [11:27<2:55:18,  4.10entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  15%|█▍        | 7032/47171 [19:35<3:24:48,  3.27entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  15%|█▍        | 7054/47171 [19:46<3:15:28,  3.42entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  15%|█▌        | 7083/47171 [19:56<1:27:41,  7.62entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  15%|█▌        | 7117/47171 [20:08<3:28:23,  3.20entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  15%|█▌        | 7118/47171 [20:19<39:23:14,  3.54s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  15%|█▌        | 7120/47171 [20:25<33:57:31,  3.05s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  22%|██▏       | 10418/47171 [29:07<4:22:15,  2.34entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  22%|██▏       | 10477/47171 [29:29<1:23:11,  7.35entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  22%|██▏       | 10509/47171 [29:40<1:04:54,  9.41entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  22%|██▏       | 10559/47171 [29:51<1:41:09,  6.03entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  29%|██▉       | 13882/47171 [41:13<1:02:09,  8.93entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  30%|██▉       | 13922/47171 [41:25<1:36:00,  5.77entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  35%|███▌      | 16583/47171 [49:32<56:12,  9.07entity/s]   

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  41%|████▏     | 19558/47171 [59:19<2:17:16,  3.35entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  41%|████▏     | 19562/47171 [59:25<6:22:17,  1.20entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  48%|████▊     | 22835/47171 [1:09:39<54:07,  7.49entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  48%|████▊     | 22846/47171 [1:09:46<2:06:39,  3.20entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▍    | 25812/47171 [1:19:36<1:08:08,  5.22entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▍    | 25813/47171 [1:19:42<10:13:55,  1.72s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▍    | 25825/47171 [1:19:48<57:07,  6.23entity/s]   

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  61%|██████    | 28723/47171 [1:29:36<1:07:35,  4.55entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  64%|██████▎   | 29988/47171 [1:39:39<3:06:51,  1.53entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  64%|██████▎   | 29992/47171 [1:39:45<4:33:33,  1.05entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  66%|██████▌   | 31220/47171 [1:49:07<3:56:06,  1.13entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  74%|███████▎  | 34734/47171 [2:19:18<1:13:27,  2.82entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  76%|███████▌  | 35966/47171 [2:29:15<1:32:32,  2.02entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  76%|███████▋  | 35978/47171 [2:29:28<1:23:19,  2.24entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  76%|███████▋  | 35983/47171 [2:29:35<2:30:31,  1.24entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  76%|███████▋  | 35990/47171 [2:29:49<2:32:07,  1.23entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  76%|███████▋  | 35994/47171 [2:29:56<3:36:26,  1.16s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38675/47171 [2:39:04<34:54,  4.06entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38676/47171 [2:39:10<3:58:28,  1.68s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 3/3)


Processing Superclasses:  82%|████████▏ | 38677/47171 [2:39:26<13:17:19,  5.63s/entity]

Failed to retrieve data after multiple attempts.


Processing Superclasses:  82%|████████▏ | 38684/47171 [2:39:28<2:02:44,  1.15entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38695/47171 [2:39:36<41:45,  3.38entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38696/47171 [2:39:42<4:33:53,  1.94s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38697/47171 [2:39:47<7:09:38,  3.04s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 3/3)


Processing Superclasses:  82%|████████▏ | 38698/47171 [2:40:03<16:01:10,  6.81s/entity]

Failed to retrieve data after multiple attempts.
Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38699/47171 [2:40:09<15:04:11,  6.40s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38720/47171 [2:40:19<33:48,  4.17entity/s]   

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  82%|████████▏ | 38898/47171 [2:41:10<1:04:46,  2.13entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38899/47171 [2:41:16<4:59:27,  2.17s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38906/47171 [2:41:24<1:21:25,  1.69entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38911/47171 [2:41:30<1:33:13,  1.48entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  82%|████████▏ | 38914/47171 [2:41:36<2:40:02,  1.16s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  83%|████████▎ | 39031/47171 [2:42:00<22:11,  6.11entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  83%|████████▎ | 39117/47171 [2:42:19<12:37, 10.63entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  83%|████████▎ | 39121/47171 [2:42:26<1:51:42,  1.20entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41321/47171 [2:49:07<19:48,  4.92entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41322/47171 [2:49:13<2:35:40,  1.60s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41327/47171 [2:49:19<1:17:53,  1.25entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41334/47171 [2:49:27<45:51,  2.12entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41338/47171 [2:49:33<1:13:14,  1.33entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41342/47171 [2:49:39<1:21:37,  1.19entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41355/47171 [2:49:47<26:18,  3.68entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41366/47171 [2:49:56<25:26,  3.80entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41720/47171 [2:51:11<25:33,  3.55entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41722/47171 [2:51:16<2:01:40,  1.34s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 41725/47171 [2:51:23<2:10:05,  1.43s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▊ | 41847/47171 [2:51:51<31:42,  2.80entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▊ | 41849/47171 [2:51:57<2:14:24,  1.52s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▊ | 41863/47171 [2:52:06<17:25,  5.08entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▉ | 41900/47171 [2:52:21<23:13,  3.78entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▉ | 41926/47171 [2:52:34<19:17,  4.53entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  93%|█████████▎| 43841/47171 [2:59:06<20:05,  2.76entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  93%|█████████▎| 43859/47171 [2:59:21<13:30,  4.08entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  93%|█████████▎| 43877/47171 [2:59:30<15:17,  3.59entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  99%|█████████▉| 46848/47171 [3:09:44<01:36,  3.35entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  99%|█████████▉| 46853/47171 [3:09:51<03:34,  1.48entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses: 100%|██████████| 47171/47171 [3:11:11<00:00,  4.11entity/s]


{'Q64618013': ['Q8362',
  'Q93184',
  'Q178659',
  'Q3305213',
  'Q116691808',
  'Q8205328',
  'Q110304307',
  'Q478798',
  'Q4502142',
  'Q6575414',
  'Q16686448',
  'Q118870638',
  'Q838948',
  'Q110910970',
  'Q111907565',
  'Q223557',
  'Q386724',
  'Q4406616',
  'Q53617407',
  'Q17537576',
  'Q488383',
  'Q123410745',
  'Q35120',
  'Q15621286',
  'Q53617489',
  'Q246672',
  'Q7048977',
  'Q103940464'],
 'Q64617600': ['Q8362',
  'Q93184',
  'Q178659',
  'Q3305213',
  'Q116691808',
  'Q8205328',
  'Q110304307',
  'Q478798',
  'Q4502142',
  'Q6575414',
  'Q16686448',
  'Q118870638',
  'Q838948',
  'Q110910970',
  'Q111907565',
  'Q223557',
  'Q386724',
  'Q4406616',
  'Q53617407',
  'Q17537576',
  'Q488383',
  'Q123410745',
  'Q35120',
  'Q15621286',
  'Q53617489',
  'Q246672',
  'Q7048977',
  'Q103940464'],
 'Q56682821': ['Q87167',
  'Q49848',
  'Q18593264',
  'Q110304307',
  'Q286583',
  'Q15621286',
  'Q37866906',
  'Q386724',
  'Q2342494',
  'Q4502142',
  'Q16686448',
  'Q488383'

In [235]:

from tqdm import tqdm


tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR3/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR3/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR3/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)



cea_values_dict = dict(zip(df_cea[3], df_cea["key_col"].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.values()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {}
for k,v in cea_values_dict.items():
    try:
        combined_dict[k] = cta_values_dict[v]
    except:
        pass


WD_query_type = {}

# Wrap the values of the dictionary in tqdm for a progress bar
for key, value in tqdm(combined_dict.items(), desc="Processing items"):
    key = key.split('/')[-1]
    result = None
    item_id = value.split('/')[-1]

    WD_query_type[key] = item_id  # Store the result (or None if no non-empty list found)

with open('HTR3_WD_query_type.json', 'w') as json_file:
    json.dump(WD_query_type, json_file, indent=4)

Processing items: 100%|██████████| 47171/47171 [00:00<00:00, 312960.92it/s]


In [153]:

def make_json_serializable(obj):
    """
    Recursively converts non-serializable objects in the dictionary
    to JSON-serializable formats.
    """
    if isinstance(obj, set):
        return list(obj)  # Convert sets to lists
    elif isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}  # Recurse for nested dictionaries
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]  # Recurse for lists
    else:
        return obj  # Return the object if it's already serializable

# Convert subsample to a JSON-serializable format
serializable_subsample = make_json_serializable(ext_WD_query_type)

In [154]:
with open('ext_WD_query_type.json', 'w') as json_file:
    json.dump(serializable_subsample, json_file, indent=4)

## NER type vs NER type

In [248]:
json_file_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/work/_HTR2/HTR2_NER_query_type.json"

with open(json_file_path, "r") as file:
    HTR2_type = json.load(file)


tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))

cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    qid = cea_values_dict_cell[key].split('/')[-1]  # Extract the QID from the URL
                    qid_to_value[qid] = cell_value
                    break  # Exit inner loop early as only one match per row/col is needed

        return qid_to_value
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
HTR2_id_to_name = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    HTR2_id_to_name.update(local_key_to_cell)

Processing tables:   0%|          | 0/2692 [00:00<?, ?it/s]2025-01-18 10:34:12,914 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00CU42PU.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
2025-01-18 10:34:12,917 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00CYT0VB.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
2025-01-18 10:34:12,921 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00DZN9U3.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
2025-01-18 10:34:12,925 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00EDPHR0.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid star

In [254]:
def get_query(name, value):
    name = str(name).replace('"', ' ')
    if value is not None:
        # Soft filtering constraint
        query_dict = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"name": {"query": name, "boost": 2.0}}}
                    ],
                    "should": [
                        {"term": {"NERtype": value}}
                    ]
                }
            }
        }
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 50,
            'query': json.dumps(query_dict),  # Convert the query dictionary to a JSON string
            'sort': [
                '{"popularity": {"order": "desc"}}'
            ]
        }
    
    return params

url = 'https://lamapi.hel.sintef.cloud/entity/labels?token=lamapi_demo_2023'

# Define the headers
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

queries = []
key_to_cell_sample = {}
for id, name in tqdm(HTR2_id_to_name.items()):
    if id in HTR2_type:
        types_list = HTR2_type[id]

        ########################################################
        ##  modificare se types_list è una lista di tipi
        ########################################################
    
        query = get_query(name, types_list)

        queries.append((query, id, types_list))
        if len(queries) == 4000:
            break


 14%|█▍        | 4012/28030 [00:00<00:00, 60249.22it/s]


In [255]:
queries

[({'name': 'Royal Gwent Hospital Library',
   'token': 'lamapi_demo_2023',
   'kg': 'wikidata',
   'limit': 50,
   'query': '{"query": {"bool": {"must": [{"match": {"name": {"query": "Royal Gwent Hospital Library", "boost": 2.0}}}], "should": [{"term": {"NERtype": "ORG"}}]}}}',
   'sort': ['{"popularity": {"order": "desc"}}']},
  'Q50228603',
  'ORG'),
 ({'name': 'Rowland Isaac Library',
   'token': 'lamapi_demo_2023',
   'kg': 'wikidata',
   'limit': 50,
   'query': '{"query": {"bool": {"must": [{"match": {"name": {"query": "Rowland Isaac Library", "boost": 2.0}}}], "should": [{"term": {"NERtype": "ORG"}}]}}}',
   'sort': ['{"popularity": {"order": "desc"}}']},
  'Q50228595',
  'ORG'),
 ({'name': "Miners' Library, Ysbyty Ystrad Fawr",
   'token': 'lamapi_demo_2023',
   'kg': 'wikidata',
   'limit': 50,
   'query': '{"query": {"bool": {"must": [{"match": {"name": {"query": "Miners\' Library, Ysbyty Ystrad Fawr", "boost": 2.0}}}], "should": [{"term": {"NERtype": "ORG"}}]}}}',
   'sort':

In [None]:
import aiohttp
import asyncio
import backoff
import nest_asyncio
import random
from tqdm import tqdm
import numpy as np

# Assume queries is a list of tuples [(param1, id1), (param2, id2), ...]

failed_queries = {}
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=10, 
    max_time=400
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        # Convert all params to str, int, or float
        #params = {k: (int(v) if isinstance(v, np.integer) else str(v)) for k, v in params.items()}
        async with session.get(url, params=params, headers=headers, timeout=50) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except asyncio.TimeoutError:
                print(f"Request timed out for params: {params}")
                return []  # Return an empty list to handle the timeout gracefully
            except aiohttp.ClientError as e:
                print(f"ClientError for params {params}: {str(e)}")
                return []
            except Exception as e:
                print(f"Unexpected error for params {params}: {str(e)}")
                return []
async def process_item(session, url, id, headers, params, semaphore, pbar):
    try:
        data = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{id}'")
            asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0


    ###################################################
    ## scandisco il candidate set in cui ho già fatto 
    ## l'overlapping dei tipi
    ###################################################
    if data:
        for item in data:
            #print(f"{id} vs {item.get('id')}")
            if id == item.get('id'):
                asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
                pos_score = item.get('pos_score', 0)
                if pos_score:
                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                else:
                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                return mrr_increment, 1

    return 0, 0

async def main(queries, url, pbar, failed_queries):
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0

    async with aiohttp.ClientSession() as session:
        tasks = []
        for param, id, _ in queries:
            tasks.append(process_item(session, url, id, headers, param, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)

        ###  overlapping Q e T_i
        
        for (mrr_increment, count), (param, id, item_NERtype) in zip(results, queries):
            if mrr_increment == 0 and count == 0:
                failed_queries[id] = (id, item_NERtype)
            else:
                m_mrr += mrr_increment
                cont_el += count

        asyncio.get_event_loop().call_soon_threadsafe(pbar.close)

    print(f"Coverage of 2T: {cont_el / len(queries)}")
    print(f"Measure Reciprocal Rank of 2T: {m_mrr / len(queries)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(queries))
        asyncio.run(main(queries, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(queries, url, pbar, failed_queries))


In [14]:
# for each WD type inserted from the user (WD_query_type taken from CEA) now we retrieve the extended WD types until the root
# given the list of the ext_types we do the overlap with the ext_types taken from lamAPI with HARD and SOFT query

ext_query_types = []

for entity_id, type_str in WD_query_type.items():
    #print(f"{type_str}: {get_type_id(type_str)}")
    
    entity_name = key_to_cell[list(WD_query_type.keys())[0]]
    ext_query_types += list(set(retrieve_superclasses(get_type_id(type_str))))
    
    # query a lamapi dove specifico nel filtro il tipo

    # entity_id è il ground truth
    WD_candidate_types = WD_types(entity_id)  # WD_types() interroga il servizio types() ma forse è sbagliato (da implementare lato server non client)
print(ext_query_types)


KeyboardInterrupt: 

In [15]:
ext_query_types

['Q53617489',
 'Q98119401',
 'Q16334295',
 'Q20937557',
 'Q2897903',
 'Q2217301',
 'Q7725310',
 'Q1002697',
 'Q117208263',
 'Q732577',
 'Q24229398',
 'Q16887380',
 'Q15621286',
 'Q11032',
 'Q1554231',
 'Q106668099',
 'Q99527517',
 'Q43229',
 'Q3523102',
 'Q286583',
 'Q1193236',
 'Q121182',
 'Q61961344',
 'Q7048977',
 'Q58415929',
 'Q1639378',
 'Q49848',
 'Q28877',
 'Q2424752',
 'Q131085629',
 'Q488383',
 'Q58778',
 'Q117208269',
 'Q12774177',
 'Q35825432',
 'Q115668308',
 'Q854457',
 'Q26907166',
 'Q17538423',
 'Q31464082',
 'Q16686448',
 'Q107435521',
 'Q47461344',
 'Q37866906',
 'Q17172633',
 'Q106559804',
 'Q28314507',
 'Q386724',
 'Q17537576',
 'Q119648442',
 'Q234460',
 'Q5127848',
 'Q35120',
 'Q17489659',
 'Q340169',
 'Q16334298',
 'Q1261026',
 'Q16889133',
 'Q103940464',
 'Q11033',
 'Q11474',
 'Q53617489',
 'Q53617407',
 'Q193395',
 'Q28555911',
 'Q2897903',
 'Q9158768',
 'Q6671777',
 'Q10683158',
 'Q251473',
 'Q99527517',
 'Q96791170',
 'Q337060',
 'Q8205328',
 'Q104450446',
 '

## WD type vs NER type

In [None]:
# for each WD type inserted from the user (WD_WD_query_type taken from CEA) now we retrieve the extended WD types until the root
# given the list of the ext_types we do the overlap with the ext_types taken from lamAPI with HARD and SOFT query

False

In [25]:
retrieve_superclasses("Q12299841")

{'Q12299841': 'cricketer',
 'Q2066131': 'athlete',
 'Q18536342': 'competitive player',
 'Q50995749': 'sportsperson',
 'Q4197743': 'player',
 'Q215627': 'person',
 'Q830077': 'subject',
 'Q795052': 'individual',
 'Q3778211': 'legal person',
 'Q53617489': 'independent continuant',
 'Q7239': 'organism',
 'Q24229398': 'agent',
 'Q106559804': 'person or organization',
 'Q103940464': 'continuant',
 'Q223557': 'physical object',
 'Q66394244': 'physical anatomical entity',
 'Q4406616': 'concrete object',
 'Q53617407': 'material entity',
 'Q27043950': 'anatomical entity',
 'Q488383': 'object',
 'Q35120': 'entity'}

In [8]:
cta_values_dict

{'58891288_0_1117541047012405958 1': 'http://dbpedia.org/ontology/Film',
 '8468806_0_4382447409703007384 1': 'http://dbpedia.org/ontology/Lake',
 '50245608_0_871275842592178099 0': 'http://dbpedia.org/ontology/Film',
 '14067031_0_559833072073397908 1': 'http://dbpedia.org/ontology/Language',
 '8286121_0_8471791395229161598 0': 'http://dbpedia.org/ontology/Country',
 '39759273_0_1427898308030295194 1': 'http://dbpedia.org/ontology/Film',
 '14380604_4_3329235705746762392 1': 'http://dbpedia.org/ontology/Company',
 '20135078_0_7570343137119682530 3': 'http://dbpedia.org/ontology/Person',
 '29414811_6_8221428333921653560 1': 'http://dbpedia.org/ontology/VideoGame',
 '34041816_1_4749054164534706977 2': 'http://dbpedia.org/ontology/City',
 '14067031_0_559833072073397908 7': 'http://dbpedia.org/ontology/Currency',
 '71137051_0_8039724067857124984 0': 'http://dbpedia.org/ontology/Bird',
 '29414811_2_4773219892816395776 1': 'http://dbpedia.org/ontology/VideoGame',
 '99070098_0_20748727413026969

In [11]:
cea_values_dict

{'50245608_0_871275842592178099 0': 'https://www.wikidata.org/entity/Q46551',
 '22864497_0_8632623712684511496 0': 'https://www.wikidata.org/entity/Q6738126',
 '66009064_0_9148652238372261251 0': 'https://www.wikidata.org/entity/Q1094988',
 '21362676_0_6854186738074119688 1': 'https://www.wikidata.org/entity/Q463832',
 '40534006_0_4617468856744635526 1': 'https://www.wikidata.org/entity/Q452590',
 '36102169_0_7739454799295072814 2': 'https://www.wikidata.org/entity/Q1978200',
 '53822652_0_5767892317858575530 1': 'https://www.wikidata.org/entity/Q200396',
 '60319454_0_3938426910282115527 0': 'https://www.wikidata.org/entity/Q69581',
 '8468806_0_4382447409703007384 1': 'https://www.wikidata.org/entity/Q1546823',
 '33401079_0_9127583903019856402 0': 'https://www.wikidata.org/entity/Q154538',
 '99070098_0_2074872741302696997 1': 'https://www.wikidata.org/entity/Q4920755',
 '50270082_0_444360818941411589 1': 'https://www.wikidata.org/entity/Q114468',
 '29414811_12_251152470253168163 1': 'ht