In [142]:
import os
import pandas as pd
from tqdm import tqdm
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import re
from collections import Counter

In [143]:
import json
from requests import get

def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids


try:
    organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
except json.decoder.JSONDecodeError:
    organization_subclass = []

try:
    country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
except json.decoder.JSONDecodeError:
    country_subclass = []

try:
    city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
except json.decoder.JSONDecodeError:
    city_subclass = []

try:
    capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
except json.decoder.JSONDecodeError:
    capitals_subclass = []

try:
    admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
except json.decoder.JSONDecodeError:
    admTerr_subclass = []

try:
    family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
except json.decoder.JSONDecodeError:
    family_subclass = []

try:
    sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
except json.decoder.JSONDecodeError:
    sportLeague_subclass = []

try:
    venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
except json.decoder.JSONDecodeError:
    venue_subclass = []

# Removing overlaps for organization_subclass
organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))

try:
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
except json.decoder.JSONDecodeError:
    geolocation_subclass = []

try:
    food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
except json.decoder.JSONDecodeError:
    food_subclass = []

try:
    edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
except json.decoder.JSONDecodeError:
    edInst_subclass = []

try:
    govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
except json.decoder.JSONDecodeError:
    govAgency_subclass = []

try:
    intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
except json.decoder.JSONDecodeError:
    intOrg_subclass = []

try:
    timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
except json.decoder.JSONDecodeError:
    timeZone_subclass = []

# Removing overlaps for geolocation_subclass
geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))



In [144]:
def split_entity_name(entity_name):
    """
    Splits the entity name into separate words if it has more than 2 uppercase letters.
    
    Args:
        entity_name (str): The input string to check and potentially split (e.g., 'VideoGame').
    
    Returns:
        str: The modified string with words split by spaces if necessary.
    """
    # Count uppercase letters
    uppercase_count = sum(1 for char in entity_name if char.isupper())
    
    # If more than 2 uppercase letters, split before every uppercase letter
    if uppercase_count >= 2:
        return re.sub(r'(?<!^)(?=[A-Z])', ' ', entity_name)
    
    return entity_name

def get_type_id(entity_name):
    """
    Queries the API to retrieve the QID of an entity based on its name.

    Args:
        entity_name (str): The name of the entity (e.g., 'Video Game').

    Returns:
        str: The QID associated with the given entity name, or None if an error occurs or no QID is found.
    """
    
    entity_name_split = split_entity_name(entity_name)

    url = f"https://lamapi.hel.sintef.cloud/lookup/entity-retrieval"
    params = {
        "name": entity_name_split,
        "kind": "type",
        "token": "lamapi_demo_2023"
    }
    headers = {
        "accept": "application/json"
    }

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
        # Assuming the API returns the QID directly or within a key, adjust if structure differs
        #return max(data, key=lambda x: x.get('pos_score', 0))
        return data[0]['id']
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


In [145]:
def retrieve_superclasses(entity_id):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784").

    Returns:
        dict: A dictionary where keys are superclass IDs, and values are their labels.
    """
    # Define the SPARQL endpoint and query
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?superclass ?superclassLabel WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=3, delay=5):
        for attempt in range(retries):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                results = sparql_client.query().convert()
                return results
            except Exception as e:
                if "429" in str(e):  # Handle Too Many Requests error
                    print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(delay)
                else:
                    print(f"An error occurred: {e}")
                    break
        return None

    # Set up the SPARQL client
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    # Process results and return as a dictionary
    if results:
        superclass_dict = {}
        for result in results["results"]["bindings"]:
            superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
            label = result["superclassLabel"]["value"]
            superclass_dict[superclass_id] = label
        return superclass_dict
    else:
        print("Failed to retrieve data after multiple attempts.")
        return {}


In [146]:
def WD_types(entity_id):
    """
    Queries the API to retrieve the list of type IDs for a given entity ID.

    Args:
        entity_id (str): The Wikidata ID of the entity (e.g., 'Q30').

    Returns:
        list: A list of type IDs associated with the given entity, or an empty list if an error occurs.
    """

    url = "https://lamapi.hel.sintef.cloud/entity/types?token=lamapi_demo_2023"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    payload = {"json": [entity_id]}
    
    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()  # Raise an error for HTTP issues
        ids = response.json()
        return ids[entity_id]['types']['P31']  # Parse and return the JSON response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    except KeyError as e:
        # Handle missing keys in the JSON response
        return []

In [169]:
person_subclass=get_wikidata_item_tree_item_idsSPARQL([5], backward_properties=[279])

In [170]:
# hai un solo NER type per entità perchè lato query type hai associato un solo 
# explicit_WD_type da mappare


def NER_types(entity_id):

    """
    Given an entity it retrieves the list of the associated NER types making
    the NER type extension of all the WD_types associated to it.
    """
    if not entity_id:
        return None  

    match = re.search(r'Q(\d+)', entity_id)
    numeric_id = int(match.group(1))

    NERtype = []
    # Classify NER types
    if numeric_id in person_subclass:
        return 'PERS'
    elif numeric_id in geolocation_subclass:
        return 'LOC'
    elif numeric_id in organization_subclass:
        return 'ORG'
    else:
        return 'OTHERS'
    

In [148]:
def extended_WD_types(entity_id):

    """
    Given an entity it retrieves the list of the associated extended WD types making
    the type extension of all the WD_types associated to it.
    """
        
    if not entity_id:
        return None

    return list(retrieve_superclasses(entity_id).keys())
    

## Dataset Reading

### Babak ORG dataset

In [None]:
path = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Babak_Annotations_Github/alligator_annotations.json'
with open(path, 'r') as json_file:
    dictionary = json.load(json_file)


key_to_cell = {}
for el in dictionary['rows']:
    if el['ids'][0] != "NIL":
        key_to_cell[el['data'][0]] = el['ids'][0]

# Initialize an empty dictionary for NER_query_type
NER_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for entity_name, entity_id in tqdm(key_to_cell.items(), desc="Processing Entities", unit="entity"):

    NER_query_type[entity_id] = "ORG"

with open('Babak_NER_query_type.json', 'w') as json_file:
    json.dump(NER_query_type, json_file, indent=4)



Processing Entities: 100%|██████████| 454/454 [00:00<00:00, 301156.73entity/s]


### Round1 and Round3 datasets

In [187]:
tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D/gt/CTA_Round1_gt.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.keys()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {key: [cea_values_dict[key], cta_values_dict[key]] for key in common_keys}


In [177]:
DBpedia_results = []

In [180]:
DBpedia_results.extend(value.split('/')[-1] for value in cta_values_dict.values())


In [129]:
import requests
from tqdm import tqdm

DBpedia_results = list(set(DBpedia_results))
DB_mapping = {}

def map_dbpedia_to_wikidata(dbpedia_type):
    """
    Maps a DBpedia type to its corresponding Wikidata ID using the DBpedia API.

    Args:
        dbpedia_type (str): The DBpedia type to map.

    Returns:
        str: The corresponding Wikidata ID if found, else None.
    """
    endpoint = "http://dbpedia.org/sparql"
    query = f"""
    SELECT DISTINCT ?wikidata_concept WHERE {{
        dbr:{dbpedia_type} owl:sameAs ?wikidata_concept
    }}
    LIMIT 50
    """

    headers = {"Accept": "application/sparql-results+json"}

    try:
        response = requests.get(endpoint, params={"query": query}, headers=headers)
        response.raise_for_status()
        results = response.json()["results"]["bindings"]
        for el in results:
            if el["wikidata_concept"]["value"].split("//")[1].split("/")[0] == "www.wikidata.org":
                return el["wikidata_concept"]["value"]
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error querying DBpedia API: {e}")
        return None

for el in tqdm(DBpedia_results, desc="Mapping DBpedia to Wikidata"):
    if map_dbpedia_to_wikidata(el) is not None:
        DB_mapping[el] = map_dbpedia_to_wikidata(el)


Mapping DBpedia to Wikidata: 100%|██████████| 274/274 [01:51<00:00,  2.46it/s]


In [131]:
len(DB_mapping)

137

In [185]:
cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    qid = cea_values_dict_cell[key].split('/')[-1]  # Extract the QID from the URL
                    qid_to_value[qid] = cell_value
                    break  # Exit inner loop early as only one match per row/col is needed

        return qid_to_value
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)

Processing tables: 100%|██████████| 2161/2161 [01:14<00:00, 28.93it/s]


### Others datasets

In [48]:

tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round4_2020/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round4_2020/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round4_2020/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.keys()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {key: [cea_values_dict[key], cta_values_dict[key]] for key in common_keys}


In [None]:
cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    qid = cea_values_dict_cell[key].split('/')[-1]  # Extract the QID from the URL
                    qid_to_value[qid] = cell_value
                    break  # Exit inner loop early as only one match per row/col is needed

        return qid_to_value
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)

In [189]:
len(WD_query_types)

45

In [188]:
WD_query_types = {}
for value in combined_dict.values():
    if value[1].split('/')[-1] in DB_mapping.keys():
        WD_query_types[value[0].split('/')[-1]] = DB_mapping[value[1].split('/')[-1]].split('/')[-1]
    
WD_query_types

{'Q1872524': 'Q11032',
 'Q69581': 'Q729',
 'Q1546823': 'Q23397',
 'Q72538': 'Q901',
 'Q2384003': 'Q5113',
 'Q30315633': 'Q11032',
 'Q503046': 'Q11424',
 'Q639481': 'Q11424',
 'Q487907': 'Q783794',
 'Q6476027': 'Q23397',
 'Q135465': 'Q11424',
 'Q459477': 'Q783794',
 'Q2078054': 'Q756',
 'Q14429': 'Q23397',
 'Q5782393': 'Q756',
 'Q154538': 'Q215627',
 'Q817': 'Q6256',
 'Q575014': 'Q11424',
 'Q208875': 'Q11032',
 'Q25193623': 'Q11424',
 'Q1033': 'Q6256',
 'Q7626741': 'Q12299841',
 'Q227': 'Q6256',
 'Q199203': 'Q729',
 'Q200396': 'Q11424',
 'Q5181893': 'Q23397',
 'Q193439': 'Q1248784',
 'Q7194441': 'Q8502',
 'Q725552': 'Q11424',
 'Q181086': 'Q11424',
 'Q46551': 'Q11424',
 'Q4920755': 'Q8502',
 'Q471169': 'Q11424',
 'Q128518': 'Q11424',
 'Q831445': 'Q783794',
 'Q463832': 'Q11424',
 'Q206319': 'Q8142',
 'Q83495': 'Q11424',
 'Q4390981': 'Q43115',
 'Q1011': 'Q6256',
 'Q178021': 'Q1248784',
 'Q61620': 'Q43115',
 'Q1781530': 'Q33506',
 'Q214801': 'Q11424',
 'Q104123': 'Q11424'}

In [10]:
WD_query_type = {
    value[0].split('/')[-1]: value[1].split('/')[-1]
    for value in combined_dict.values()
}

WD_query_type

{'Q42302919': 'Q604435',
 'Q27253939': 'Q27253938',
 'Q60193329': 'Q1435962',
 'Q6655': 'Q17272482',
 'Q186363': 'Q32880',
 'Q20937': 'Q4925355',
 'Q1143707': 'Q2919801',
 'Q9671': 'Q5',
 'Q30': 'Q3624078',
 'Q28121405': 'Q1059407',
 'Q34': 'Q3624078',
 'Q5268834': 'Q268592',
 'Q14545639': 'Q3679744',
 'Q18888835': 'Q2417724',
 'Q60052167': 'Q7694920',
 'Q66531600': 'Q1153690',
 'Q19652': 'Q50424085',
 'Q86084984': 'Q11446',
 'Q2269267': 'Q131436',
 'Q587161': 'Q5',
 'Q2240599': 'Q3950',
 'Q66012487': 'Q56855534',
 'Q22595594': 'Q23397',
 'Q64623388': 'Q5',
 'Q55621374': 'Q52721923',
 'Q20201263': 'Q184296',
 'Q56507068': 'Q5',
 'Q17511685': 'Q1584134',
 'Q55212121': 'Q18593264',
 'Q914148': 'Q52193405',
 'Q27055581': 'Q1674283',
 'Q1134591': 'Q744913',
 'Q38': 'Q3624078',
 'Q222': 'Q3624078',
 'Q135705': 'Q2990963',
 'Q15537': 'Q33146843',
 'Q881': 'Q3624078',
 'Q28122364': 'Q3390872',
 'Q183': 'Q3624078',
 'Q58466778': 'Q11835431',
 'Q19882906': 'Q3002150',
 'Q19982739': 'Q13433827',

In [136]:
with open('R1_WD_query_type.json', 'w') as json_file:
    json.dump(WD_types, json_file, indent=4)


In [82]:
#print(f"{type_str}: {get_type_id(type_str)}")

entity_name = key_to_cell[list(WD_query_type.keys())[0]]
ext_query_types = list(set(retrieve_superclasses("Q31")))

# query a lamapi dove specifico nel filtro il tipo

#WD_candidate_types = WD_types(entity_id)  # WD_types() interroga il servizio types() ma forse è sbagliato (da implementare lato server non client)
print(ext_query_types)

['Q31']


In [190]:
# Initialize an empty dictionary for NER_query_type
NER_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for entity_name, entity_type in tqdm(WD_query_types.items(), desc="Processing Entities", unit="entity"):
    if entity_type is None:
        print("none")
        NER_query_type[entity_name] = None
    else:
        if entity_type in ["Q81096", "Q36180", "Q82955", "Q2066131", "Q4964182", "Q63532478", "Q12299841", "Q201788", "Q245068", "Q33231", "Q49757", "Q846750", "Q116", "Q132050", "Q373085", "Q42973", "Q11631", "Q11338576", "Q1979154", "Q215627", "Q901", "Q188094", "Q1930187", "Q483501", "Q15686806", "Q16533"]:
            NER_query_type[entity_name] = "PERS"
        else:
            NER_query_type[entity_name] = NER_types(entity_type)

with open('R1_NER_query_type.json', 'w') as json_file:
    json.dump(NER_query_type, json_file, indent=4)


# Display the resulting dictionary
NER_query_type

Processing Entities: 100%|██████████| 45/45 [00:00<00:00, 423.23entity/s]


{'Q1872524': 'ORG',
 'Q69581': 'OTHERS',
 'Q1546823': 'LOC',
 'Q72538': 'PERS',
 'Q2384003': 'OTHERS',
 'Q30315633': 'ORG',
 'Q503046': 'OTHERS',
 'Q639481': 'OTHERS',
 'Q487907': 'ORG',
 'Q6476027': 'LOC',
 'Q135465': 'OTHERS',
 'Q459477': 'ORG',
 'Q2078054': 'OTHERS',
 'Q14429': 'LOC',
 'Q5782393': 'OTHERS',
 'Q154538': 'PERS',
 'Q817': 'OTHERS',
 'Q575014': 'OTHERS',
 'Q208875': 'ORG',
 'Q25193623': 'OTHERS',
 'Q1033': 'OTHERS',
 'Q7626741': 'PERS',
 'Q227': 'OTHERS',
 'Q199203': 'OTHERS',
 'Q200396': 'OTHERS',
 'Q5181893': 'LOC',
 'Q193439': 'ORG',
 'Q7194441': 'LOC',
 'Q725552': 'OTHERS',
 'Q181086': 'OTHERS',
 'Q46551': 'OTHERS',
 'Q4920755': 'LOC',
 'Q471169': 'OTHERS',
 'Q128518': 'OTHERS',
 'Q831445': 'ORG',
 'Q463832': 'OTHERS',
 'Q206319': 'OTHERS',
 'Q83495': 'OTHERS',
 'Q4390981': 'OTHERS',
 'Q1011': 'OTHERS',
 'Q178021': 'ORG',
 'Q61620': 'OTHERS',
 'Q1781530': 'ORG',
 'Q214801': 'OTHERS',
 'Q104123': 'OTHERS'}

In [162]:
# Initialize an empty dictionary for ext_WD_query_type
ext_WD_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for entity_name, entity_type in tqdm(WD_query_types.items(), desc="Processing Superclasses", unit="entity"):
    #print(f"{el}: #superclasses {len(extended_WD_types(el))}")
    ext_WD_query_type[entity_name] = extended_WD_types(entity_type)

with open('R3_ext_WD_query_type.json', 'w') as json_file:
    json.dump(ext_WD_query_type, json_file, indent=4)


# Display the resulting dictionary
ext_WD_query_type

Processing Superclasses:  15%|█▍        | 421/2856 [02:12<14:54,  2.72entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  16%|█▌        | 451/2856 [02:32<22:08,  1.81entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  32%|███▏      | 922/2856 [05:55<06:50,  4.71entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  34%|███▍      | 971/2856 [06:12<06:04,  5.17entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  34%|███▍      | 984/2856 [06:20<08:32,  3.65entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  36%|███▌      | 1019/2856 [06:33<06:56,  4.41entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  36%|███▌      | 1023/2856 [06:40<25:59,  1.18entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  64%|██████▎   | 1819/2856 [10:34<04:18,  4.00entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  70%|██████▉   | 1996/2856 [12:13<05:09,  2.78entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  70%|███████   | 2001/2856 [12:20<10:26,  1.36entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  88%|████████▊ | 2518/2856 [15:15<02:29,  2.26entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▊ | 2534/2856 [15:26<02:24,  2.23entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  89%|████████▉ | 2548/2856 [15:35<01:23,  3.67entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses: 100%|██████████| 2856/2856 [17:20<00:00,  2.75entity/s]


{'Q876441': ['Q29182',
  'Q352507',
  'Q831474',
  'Q1423891',
  'Q1638918',
  'Q4164871',
  'Q112074151',
  'Q42603',
  'Q3315492',
  'Q105200223',
  'Q214339',
  'Q2259532',
  'Q4897819',
  'Q4504549',
  'Q123241435',
  'Q17573152',
  'Q123240632',
  'Q215627',
  'Q103940464',
  'Q795052',
  'Q3778211',
  'Q35120',
  'Q7239',
  'Q24229398',
  'Q106559804',
  'Q223557',
  'Q66394244',
  'Q53617489',
  'Q4406616',
  'Q53617407',
  'Q27043950',
  'Q488383'],
 'Q6302456': ['Q571',
  'Q49848',
  'Q340169',
  'Q732577',
  'Q2424752',
  'Q286583',
  'Q15621286',
  'Q37866906',
  'Q12774177',
  'Q115668308',
  'Q488383',
  'Q16686448',
  'Q35120',
  'Q35825432',
  'Q386724',
  'Q1554231',
  'Q3523102',
  'Q17538423',
  'Q31464082',
  'Q53617489',
  'Q103940464'],
 'Q490098': ['Q7365',
  'Q27062223',
  'Q28843519',
  'Q66545292',
  'Q112826975',
  'Q104034697',
  'Q66536017',
  'Q28846468',
  'Q4406616',
  'Q488383',
  'Q712378',
  'Q66591980',
  'Q4936952',
  'Q35120',
  'Q27043948',
  'Q555

In [235]:

from tqdm import tqdm


tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR3/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR3/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR3/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)



cea_values_dict = dict(zip(df_cea[3], df_cea["key_col"].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.values()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {}
for k,v in cea_values_dict.items():
    try:
        combined_dict[k] = cta_values_dict[v]
    except:
        pass


WD_query_type = {}

# Wrap the values of the dictionary in tqdm for a progress bar
for key, value in tqdm(combined_dict.items(), desc="Processing items"):
    key = key.split('/')[-1]
    result = None
    item_id = value.split('/')[-1]

    WD_query_type[key] = item_id  # Store the result (or None if no non-empty list found)

with open('HTR3_WD_query_type.json', 'w') as json_file:
    json.dump(WD_query_type, json_file, indent=4)

Processing items: 100%|██████████| 47171/47171 [00:00<00:00, 312960.92it/s]


In [153]:

def make_json_serializable(obj):
    """
    Recursively converts non-serializable objects in the dictionary
    to JSON-serializable formats.
    """
    if isinstance(obj, set):
        return list(obj)  # Convert sets to lists
    elif isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}  # Recurse for nested dictionaries
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]  # Recurse for lists
    else:
        return obj  # Return the object if it's already serializable

# Convert subsample to a JSON-serializable format
serializable_subsample = make_json_serializable(ext_WD_query_type)

In [154]:
with open('ext_WD_query_type.json', 'w') as json_file:
    json.dump(serializable_subsample, json_file, indent=4)

## NER type vs NER type

In [41]:
import json 

json_file_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/work/_HTR2/HTR2_ext_WD_query_type.json"

with open(json_file_path, "r") as file:
    HTR2_type = json.load(file)


tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))

cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    qid = cea_values_dict_cell[key].split('/')[-1]  # Extract the QID from the URL
                    qid_to_value[qid] = cell_value
                    break  # Exit inner loop early as only one match per row/col is needed

        return qid_to_value
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
HTR2_id_to_name = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    HTR2_id_to_name.update(local_key_to_cell)

Processing tables:   0%|          | 0/2692 [00:00<?, ?it/s]2025-01-27 17:10:29,461 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00CU42PU.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
2025-01-27 17:10:29,464 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00CYT0VB.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
2025-01-27 17:10:29,466 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00DZN9U3.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
2025-01-27 17:10:29,469 - ERROR - Error processing C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/._00EDPHR0.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid star

### CREATION OF THE QUERY HARD

In [42]:
def get_query(name, value):
    name = str(name).replace('"', ' ')
    if value is not None:
        # hard filtering constraint
        query_dict = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"name": {"query": name, "boost": 2.0}}},
                        {"terms": {"extended_WDtypes": value}}  # Ensures `value` matches at least one in the array
                    ]
                }
            }
        }
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 20,
            'query': json.dumps(query_dict),  # Convert the query dictionary to a JSON string
            'sort': [
                '{"popularity": {"order": "desc"}}'
            ]
        }
    
    return params


queries = []
for id, name in tqdm(HTR2_id_to_name.items()):
    if id in HTR2_type:
        types_list = HTR2_type[id]

        ########################################################
        ##  modificare se types_list è una lista di tipi
        ########################################################
    
        query = get_query(name, types_list)

        queries.append((query, id, types_list))
        if len(queries) == 4000:
            break


 14%|█▍        | 4012/28030 [00:00<00:00, 33134.42it/s]


### CREATION OF THE QUERY SOFT

In [40]:
def get_query(name, value):
    name = str(name).replace('"', ' ')
    
    if value is not None:
        # Check if value is a list. If it's a list, create multiple `term` clauses; otherwise, use a single `terms` clause.
        should_clause = []
        if isinstance(value, list):
            should_clause = [{"term": {"ext_WD_type": v}} for v in value]
        else:
            should_clause = [{"terms": {"ext_WD_type": [value]}}]

        # Build the query dictionary
        query_dict = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"name": {"query": name, "boost": 2.0}}}
                    ],
                    "should": should_clause  # Add the dynamically generated should clause
                }
            }
        }

        # Build the parameters dictionary
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 20,
            'query': json.dumps(query_dict),  # Convert the query dictionary to a JSON string
            'sort': [
                '{"popularity": {"order": "desc"}}'
            ]
        }
    
        return params


queries = []
for id, name in tqdm(HTR2_id_to_name.items()):
    if id in HTR2_type:
        types_list = HTR2_type[id]

        ########################################################
        ##  modificare se types_list è una lista di tipi
        ########################################################
    
        query = get_query(name, types_list)

        queries.append((query, id, types_list))
        if len(queries) == 4000:
            break


  0%|          | 0/28030 [00:00<?, ?it/s]

 14%|█▍        | 4012/28030 [00:00<00:00, 40476.52it/s]


In [44]:
import aiohttp
import asyncio
import backoff
import nest_asyncio
import random
from tqdm import tqdm
import numpy as np

# Assume queries is a list of tuples [(param1, id1), (param2, id2), ...]

failed_queries = {}
url = 'http://localhost:5000/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=10, 
    max_time=400
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        # Convert all params to str, int, or float
        #params = {k: (int(v) if isinstance(v, np.integer) else str(v)) for k, v in params.items()}
        async with session.get(url, params=params, headers=headers, timeout=50) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except asyncio.TimeoutError:
                print(f"Request timed out for params: {params}")
                return []  # Return an empty list to handle the timeout gracefully
            except aiohttp.ClientError as e:
                print(f"ClientError for params : {str(e)}")
                return []
            except Exception as e:
                print(f"Unexpected error for params {params}: {str(e)}")
                return []
async def process_item(session, url, id, headers, params, semaphore, pbar):
    try:
        data = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{id}'")
            asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0


    ###################################################
    ## scandisco il candidate set in cui ho già fatto 
    ## l'overlapping dei tipi
    ###################################################
    
    #print(f"------------>{eval(params['query'])['query']['bool']['must'][1]} - # candidate: {len(data)}")
    if data:
        for item in data:
            if id == item.get('id'):
                #print(f"{item.get('name')}: es_score({item.get('es_score', 0)}), pos_score({item.get('pos_score', 0)})-> {item.get('description')}")
                asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
                pos_score = item.get('pos_score', 0)
                if pos_score:
                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                else:
                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                return mrr_increment, 1

    return 0, 0

async def main(queries, url, pbar, failed_queries):
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0

    async with aiohttp.ClientSession() as session:
        tasks = []
        for param, id, _ in queries:
            tasks.append(process_item(session, url, id, headers, param, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (param, id, item_NERtype) in zip(results, queries):
            if mrr_increment == 0 and count == 0:
                failed_queries[id] = (id, item_NERtype)
                
                # redo the same query with the fuzzy
                name = param['name']
                
                # Parse the string into a Python dictionary
                query_dict = json.loads(param['query'])

                # Modify the "match" field
                if "query" in query_dict and "bool" in query_dict["query"] and "must" in query_dict["query"]["bool"]:
                    for condition in query_dict["query"]["bool"]["must"]:
                        if "match" in condition and "name" in condition["match"]:
                            condition["match"]["name"]["fuzziness"] = "AUTO"

                # Convert back to JSON string
                param['query'] = json.dumps(query_dict)
                print(param['query'])

                response = requests.get(url, params=param)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
                m_mrr += mrr_increment
                cont_el += count 
            else:
                m_mrr += mrr_increment
                cont_el += count

        asyncio.get_event_loop().call_soon_threadsafe(pbar.close)

    print(f"Coverage of 2T: {cont_el / len(queries)}")
    print(f"Measure Reciprocal Rank of 2T: {m_mrr / len(queries)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(queries))
        asyncio.run(main(queries, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(queries, url, pbar, failed_queries))




  0%|          | 0/4000 [14:37<?, ?it/s]
2025-01-27 17:27:22,822 - INFO - Backing off fetch(...) for 0.3s (aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host localhost:5000 ssl:default [Connect call failed ('127.0.0.1', 5000)])
2025-01-27 17:27:22,839 - INFO - Backing off fetch(...) for 0.7s (aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host localhost:5000 ssl:default [Connect call failed ('::1', 5000, 0, 0)])
2025-01-27 17:27:22,858 - INFO - Backing off fetch(...) for 0.5s (aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host localhost:5000 ssl:default [Connect call failed ('::1', 5000, 0, 0)])
2025-01-27 17:27:22,865 - INFO - Backing off fetch(...) for 0.2s (aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host localhost:5000 ssl:default [Connect call failed ('127.0.0.1', 5000)])
2025-01-27 17:27:22,873 - INFO - Backing off fetch(...) for 0.9s (aiohttp.client_exceptions.ClientConnectorError: Cannot connect to

KeyboardInterrupt: 

In [14]:
# for each WD type inserted from the user (WD_query_type taken from CEA) now we retrieve the extended WD types until the root
# given the list of the ext_types we do the overlap with the ext_types taken from lamAPI with HARD and SOFT query

ext_query_types = []

for entity_id, type_str in WD_query_type.items():
    #print(f"{type_str}: {get_type_id(type_str)}")
    
    entity_name = key_to_cell[list(WD_query_type.keys())[0]]
    ext_query_types += list(set(retrieve_superclasses(get_type_id(type_str))))
    
    # query a lamapi dove specifico nel filtro il tipo

    # entity_id è il ground truth
    WD_candidate_types = WD_types(entity_id)  # WD_types() interroga il servizio types() ma forse è sbagliato (da implementare lato server non client)
print(ext_query_types)


KeyboardInterrupt: 

In [15]:
ext_query_types

['Q53617489',
 'Q98119401',
 'Q16334295',
 'Q20937557',
 'Q2897903',
 'Q2217301',
 'Q7725310',
 'Q1002697',
 'Q117208263',
 'Q732577',
 'Q24229398',
 'Q16887380',
 'Q15621286',
 'Q11032',
 'Q1554231',
 'Q106668099',
 'Q99527517',
 'Q43229',
 'Q3523102',
 'Q286583',
 'Q1193236',
 'Q121182',
 'Q61961344',
 'Q7048977',
 'Q58415929',
 'Q1639378',
 'Q49848',
 'Q28877',
 'Q2424752',
 'Q131085629',
 'Q488383',
 'Q58778',
 'Q117208269',
 'Q12774177',
 'Q35825432',
 'Q115668308',
 'Q854457',
 'Q26907166',
 'Q17538423',
 'Q31464082',
 'Q16686448',
 'Q107435521',
 'Q47461344',
 'Q37866906',
 'Q17172633',
 'Q106559804',
 'Q28314507',
 'Q386724',
 'Q17537576',
 'Q119648442',
 'Q234460',
 'Q5127848',
 'Q35120',
 'Q17489659',
 'Q340169',
 'Q16334298',
 'Q1261026',
 'Q16889133',
 'Q103940464',
 'Q11033',
 'Q11474',
 'Q53617489',
 'Q53617407',
 'Q193395',
 'Q28555911',
 'Q2897903',
 'Q9158768',
 'Q6671777',
 'Q10683158',
 'Q251473',
 'Q99527517',
 'Q96791170',
 'Q337060',
 'Q8205328',
 'Q104450446',
 '

## WD type vs NER type

In [None]:
# for each WD type inserted from the user (WD_WD_query_type taken from CEA) now we retrieve the extended WD types until the root
# given the list of the ext_types we do the overlap with the ext_types taken from lamAPI with HARD and SOFT query

False

In [25]:
retrieve_superclasses("Q12299841")

{'Q12299841': 'cricketer',
 'Q2066131': 'athlete',
 'Q18536342': 'competitive player',
 'Q50995749': 'sportsperson',
 'Q4197743': 'player',
 'Q215627': 'person',
 'Q830077': 'subject',
 'Q795052': 'individual',
 'Q3778211': 'legal person',
 'Q53617489': 'independent continuant',
 'Q7239': 'organism',
 'Q24229398': 'agent',
 'Q106559804': 'person or organization',
 'Q103940464': 'continuant',
 'Q223557': 'physical object',
 'Q66394244': 'physical anatomical entity',
 'Q4406616': 'concrete object',
 'Q53617407': 'material entity',
 'Q27043950': 'anatomical entity',
 'Q488383': 'object',
 'Q35120': 'entity'}

In [8]:
cta_values_dict

{'58891288_0_1117541047012405958 1': 'http://dbpedia.org/ontology/Film',
 '8468806_0_4382447409703007384 1': 'http://dbpedia.org/ontology/Lake',
 '50245608_0_871275842592178099 0': 'http://dbpedia.org/ontology/Film',
 '14067031_0_559833072073397908 1': 'http://dbpedia.org/ontology/Language',
 '8286121_0_8471791395229161598 0': 'http://dbpedia.org/ontology/Country',
 '39759273_0_1427898308030295194 1': 'http://dbpedia.org/ontology/Film',
 '14380604_4_3329235705746762392 1': 'http://dbpedia.org/ontology/Company',
 '20135078_0_7570343137119682530 3': 'http://dbpedia.org/ontology/Person',
 '29414811_6_8221428333921653560 1': 'http://dbpedia.org/ontology/VideoGame',
 '34041816_1_4749054164534706977 2': 'http://dbpedia.org/ontology/City',
 '14067031_0_559833072073397908 7': 'http://dbpedia.org/ontology/Currency',
 '71137051_0_8039724067857124984 0': 'http://dbpedia.org/ontology/Bird',
 '29414811_2_4773219892816395776 1': 'http://dbpedia.org/ontology/VideoGame',
 '99070098_0_20748727413026969

In [11]:
cea_values_dict

{'50245608_0_871275842592178099 0': 'https://www.wikidata.org/entity/Q46551',
 '22864497_0_8632623712684511496 0': 'https://www.wikidata.org/entity/Q6738126',
 '66009064_0_9148652238372261251 0': 'https://www.wikidata.org/entity/Q1094988',
 '21362676_0_6854186738074119688 1': 'https://www.wikidata.org/entity/Q463832',
 '40534006_0_4617468856744635526 1': 'https://www.wikidata.org/entity/Q452590',
 '36102169_0_7739454799295072814 2': 'https://www.wikidata.org/entity/Q1978200',
 '53822652_0_5767892317858575530 1': 'https://www.wikidata.org/entity/Q200396',
 '60319454_0_3938426910282115527 0': 'https://www.wikidata.org/entity/Q69581',
 '8468806_0_4382447409703007384 1': 'https://www.wikidata.org/entity/Q1546823',
 '33401079_0_9127583903019856402 0': 'https://www.wikidata.org/entity/Q154538',
 '99070098_0_2074872741302696997 1': 'https://www.wikidata.org/entity/Q4920755',
 '50270082_0_444360818941411589 1': 'https://www.wikidata.org/entity/Q114468',
 '29414811_12_251152470253168163 1': 'ht