In [1]:
import os
import pandas as pd
from tqdm import tqdm
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import re

In [2]:
def WD_types(entity_id):
    """
    Queries the LamAPI `entity/types` service for the type IDs of an entity.

    Args:
        entity_id (str): The Wikidata ID of the entity (e.g., 'Q30').

    Returns:
        list: The type IDs found under the 'P31' key of the response for the
        given entity, or an empty list if the request fails, the body is not
        valid JSON, or the response does not contain the entity / its 'P31'
        types.
    """

    url = "https://lamapi.hel.sintef.cloud/entity/types?token=lamapi_demo_2023"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    payload = {"json": [entity_id]}

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP issues
        ids = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decoding error is not a RequestException subclass.
        print(f"An error occurred: {e}")
        return []

    try:
        return ids[entity_id]['types']['P31']
    except (KeyError, TypeError):
        # Entity unknown to the service, or it has no 'P31' entry: honour the
        # documented "empty list" contract instead of raising.
        print(f"No P31 types found for {entity_id}")
        return []

In [3]:
def split_entity_name(entity_name):
    """
    Splits a CamelCase entity name into space-separated words.

    The name is only touched when it contains at least 2 uppercase letters
    (e.g., 'VideoGame' -> 'Video Game'); a single-word name such as 'Film' is
    returned unchanged.

    Args:
        entity_name (str): The input string to check and potentially split (e.g., 'VideoGame').

    Returns:
        str: The modified string with words split by spaces if necessary.
    """
    # Count uppercase letters
    uppercase_count = sum(1 for char in entity_name if char.isupper())

    # With at least 2 uppercase letters, insert a space at CamelCase word
    # boundaries only:
    #   * lowercase/digit followed by uppercase      ('VideoGame' -> 'Video Game')
    #   * end of an acronym followed by a new word   ('TVShow'    -> 'TV Show')
    # Requiring a letter/digit on the left means an already spaced name
    # ('Video Game') is not given a second space, and acronyms stay intact.
    if uppercase_count >= 2:
        return re.sub(
            r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])',
            ' ',
            entity_name,
        )

    return entity_name

def get_type_id(entity_name):
    """
    Queries the LamAPI `lookup/entity-retrieval` service for the QID of a type name.

    The name is first passed through `split_entity_name`, so a CamelCase
    label such as 'VideoGame' is looked up as 'Video Game'.

    Args:
        entity_name (str): The name of the entity (e.g., 'Video Game').

    Returns:
        str: The 'id' of the first result returned by the service, or None if
        the request fails, the body is not valid JSON, or no result with an
        'id' is returned.
    """

    entity_name_split = split_entity_name(entity_name)

    url = "https://lamapi.hel.sintef.cloud/lookup/entity-retrieval"
    params = {
        "name": entity_name_split,
        "kind": "type",
        "token": "lamapi_demo_2023"
    }
    headers = {
        "accept": "application/json"
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decoding error is not a RequestException subclass.
        print(f"An error occurred: {e}")
        return None

    try:
        # The code assumes the service answers with a list of candidates and
        # takes the first one. NOTE(review): confirm the response schema.
        return data[0]['id']
    except (IndexError, KeyError, TypeError):
        # Empty result list or unexpected shape: honour the documented
        # "None if no QID is found" contract instead of raising.
        print(f"No QID found for '{entity_name_split}'")
        return None


In [4]:
def retrieve_superclasses(entity_id):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    Runs a SPARQL query against the Wikidata endpoint that follows
    `wdt:P279` zero or more times starting from the entity, so the entity
    itself is part of the result.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784").

    Returns:
        dict: A dictionary where keys are superclass IDs, and values are their
        labels (the ID itself when a binding carries no label). An empty dict
        is returned when `entity_id` is not a well-formed QID or when the
        query fails.
    """
    # Reject None / malformed IDs up front: the ID is interpolated into the
    # query text, so anything that is not a plain QID would produce an invalid
    # (or unintended) query such as `wd:None`.
    if not isinstance(entity_id, str) or not re.fullmatch(r"Q\d+", entity_id.strip()):
        print(f"Invalid Wikidata entity ID: {entity_id!r}")
        return {}
    entity_id = entity_id.strip()

    # Define the SPARQL endpoint and query
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?superclass ?superclassLabel WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=3, delay=5):
        for attempt in range(retries):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                return sparql_client.query().convert()
            except Exception as e:
                if "429" not in str(e):
                    # Anything other than "Too Many Requests" is not retried.
                    print(f"An error occurred: {e}")
                    return None
                if attempt + 1 == retries:
                    # Last attempt: sleeping again would only waste time.
                    break
                print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                time.sleep(delay)
        return None

    # Set up the SPARQL client
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    if not results:
        print("Failed to retrieve data after multiple attempts.")
        return {}

    # Process results and return as a dictionary
    superclass_dict = {}
    for result in results["results"]["bindings"]:
        superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
        # Fall back to the ID when the binding carries no label.
        label = result.get("superclassLabel", {}).get("value", superclass_id)
        superclass_dict[superclass_id] = label
    return superclass_dict


In [11]:
# Smoke test: fetch the superclass chain (wdt:P279*) of Q25193623; the entity itself is always included.
retrieve_superclasses("Q25193623")

{'Q25193623': 'Swing Time'}

## Dataset Reading

In [5]:

# Root folder of the Round1_T2D dataset. Set the LAMAPI_DATASET_ROOT
# environment variable to run the notebook on another machine; the default
# keeps the original location.
dataset_root = os.environ.get(
    "LAMAPI_DATASET_ROOT",
    "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D",
).rstrip("/\\")

tables_path = f"{dataset_root}/tables/"
cea_file = f"{dataset_root}/gt/CEA_Round1_gt_wd.csv"
cta_file = f"{dataset_root}/gt/CTA_Round1_gt.csv"

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Fail early, with a readable message, when the tables folder is missing.
if not os.path.isdir(tables_path):
    raise FileNotFoundError(f"Tables directory not found: {tables_path}")

# Read the cea_file (no header row) and build two string keys per line:
#   "key"     = "<col 0> <col 1> <col 2>"  (used for cell-level lookups)
#   "key_col" = "<col 0> <col 2>"          (used to join with the CTA file)
# NOTE(review): col 0 looks like the table name and cols 1/2 like row/column
# indices — confirm against the ground-truth file format.
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
# Several lines share the same "key_col"; dict() keeps the value of the last one.
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))


# Read the cta_file (no header row): key "<col 0> <col 1>" -> col 2.
df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.keys()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values [CEA value, CTA value].
# Iterate the CEA dict (file order) rather than the set: iteration order of a
# set of strings changes between kernel runs, which made every "first
# element" taken from this mapping later in the notebook non-reproducible.
combined_dict = {
    key: [cea_values_dict[key], cta_values_dict[key]]
    for key in cea_values_dict
    if key in cta_values_dict
}


In [6]:
# Cell-level CEA lookup: full "key" string -> value of CEA column 3.
cea_values_dict_cell = {
    cell_key: entity_uri
    for cell_key, entity_uri in zip(df_cea["key"].values, df_cea[3].values)
}
# Fast membership test over the same keys.
cea_keys_set = set(cea_values_dict_cell)

# Function to process a single table file
def process_table_file(table_file):
    """
    Map the QIDs annotated for one table to the text of the annotated cells.

    The table is read with pandas and every cell is looked up in the
    module-level `cea_values_dict_cell` under the key
    "<table name> <row+1> <col>", where the table name is the file name
    without extension. Every annotated cell of a row is collected, not only
    the first one, so tables with several annotated columns are fully
    covered.

    Args:
        table_file (str): Path of the CSV file of the table.

    Returns:
        dict: QID (last path segment of the annotated URI) -> cell value.
        An empty dict is returned, and the error is logged, if the file
        cannot be processed.
    """
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        n_rows, n_cols = df.shape
        for row in range(n_rows):
            for col in range(n_cols):
                # Keys use row + 1 — presumably because pandas consumes the
                # header line as row 0; confirm against the ground truth.
                entity_uri = cea_values_dict_cell.get(f"{table_name} {row+1} {col}")
                if not isinstance(entity_uri, str):
                    # Cell not annotated (or annotation missing/NaN): skip it
                    # instead of discarding the whole table.
                    continue
                qid = entity_uri.split('/')[-1]  # Extract the QID from the URL
                qid_to_value[qid] = df.iat[row, col]

        return qid_to_value
    except Exception as e:
        # Deliberate best effort: one unreadable table must not stop the run.
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files. os.listdir returns entries in arbitrary order, so the
# names are sorted: when the same QID occurs in several tables the last table
# processed wins, and sorting makes that choice reproducible across runs.
table_files = [os.path.join(tables_path, table) for table in sorted(os.listdir(tables_path))]

# Process tables sequentially, merging each table's QID -> cell text mapping
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)

Processing tables: 100%|██████████| 64/64 [00:01<00:00, 42.76it/s]


In [16]:
# Spot check: cell text collected for the annotated entity Q1872524.
key_to_cell['Q1872524']

'Dainik Bhaskar'

In [8]:
# For every [CEA value, CTA value] pair keep only the last path segment of
# each URI: the result maps the segment of the first to the segment of the second.
WD_query_type = {
    entity_uri.rsplit('/', 1)[-1]: type_uri.rsplit('/', 1)[-1]
    for entity_uri, type_uri in combined_dict.values()
}

WD_query_type

{'Q1872524': 'Newspaper',
 'Q25193623': 'Film',
 'Q575014': 'Film',
 'Q9669': 'PoliticalParty',
 'Q375648': 'BaseballPlayer',
 'Q7194441': 'Mountain',
 'Q72538': 'Scientist',
 'Q104123': 'Film',
 'Q199203': 'Animal',
 'Q1781530': 'Museum',
 'Q6738126': 'VideoGame',
 'Q206319': 'Currency',
 'Q1765733': 'VideoGame',
 'Q178021': 'Airport',
 'Q39379': 'AcademicJournal',
 'Q1205676': 'VideoGame',
 'Q30315633': 'Newspaper',
 'Q663665': 'VideoGame',
 'Q380519': 'TelevisionShow',
 'Q1033': 'Country',
 'Q503046': 'Film',
 'Q114468': 'BaseballPlayer',
 'Q2384003': 'Bird',
 'Q26': 'AdministrativeRegion',
 'Q5181893': 'Lake',
 'Q193439': 'Airport',
 'Q1011': 'Country',
 'Q452590': 'Wrestler',
 'Q487907': 'Company',
 'Q7626741': 'Cricketer',
 'Q831445': 'Company',
 'Q1060323': 'VideoGame',
 'Q200396': 'Film',
 'Q1094988': 'BaseballPlayer',
 'Q14429': 'Lake',
 'Q128518': 'Film',
 'Q459477': 'Company',
 'Q1546823': 'Lake',
 'Q83495': 'Film',
 'Q4920755': 'Mountain',
 'Q61620': 'Saint',
 'Q154538': 'P

In [9]:
# Probe cell: take the cell text of the first entity in WD_query_type and
# fetch the superclass IDs of a fixed example entity (Q31).
entity_name = key_to_cell[list(WD_query_type)[0]]
ext_query_types = list(set(retrieve_superclasses("Q31").keys()))

# TODO: query lamAPI specifying the type in the filter.
# TODO: WD_types(entity_id) queries the types() service, but that may be the
#       wrong place for it (should be implemented server-side, not client-side).
print(ext_query_types)

['Q31']


## Extended WD type vs WD type

In [14]:
# For each type supplied by the user (WD_query_type, built from the CEA/CTA
# ground truth) retrieve the extended Wikidata types, i.e. all superclasses up
# to the root. The resulting list is meant to be overlapped with the extended
# types obtained from lamAPI with the HARD and SOFT queries.

ext_query_types = []
# type name -> list of superclass IDs. Many entities share the same type, so
# each type is resolved over the network once instead of once per entity.
superclasses_by_type = {}

for entity_id, type_str in WD_query_type.items():
    # Cell text of the current entity (None when no cell was collected for it).
    entity_name = key_to_cell.get(entity_id)

    superclasses = superclasses_by_type.get(type_str)
    if superclasses is None:
        type_id = get_type_id(type_str)
        # get_type_id returns None when the lookup fails: skip the SPARQL
        # query instead of asking for the superclasses of "None".
        superclasses = list(set(retrieve_superclasses(type_id))) if type_id else []
        if superclasses:
            # Only successful lookups are cached, so a transient failure is
            # retried for the next entity of the same type.
            superclasses_by_type[type_str] = superclasses
    ext_query_types += superclasses

    # TODO: query lamAPI specifying the type in the filter.

    # entity_id is the ground truth
    WD_candidate_types = WD_types(entity_id)  # WD_types() queries the types() service, but that may be wrong (should be implemented server-side, not client-side)
print(ext_query_types)


KeyboardInterrupt: 

In [15]:
# Inspect the superclass IDs accumulated so far (IDs repeat because every entity appends its type's superclasses).
ext_query_types

['Q53617489',
 'Q98119401',
 'Q16334295',
 'Q20937557',
 'Q2897903',
 'Q2217301',
 'Q7725310',
 'Q1002697',
 'Q117208263',
 'Q732577',
 'Q24229398',
 'Q16887380',
 'Q15621286',
 'Q11032',
 'Q1554231',
 'Q106668099',
 'Q99527517',
 'Q43229',
 'Q3523102',
 'Q286583',
 'Q1193236',
 'Q121182',
 'Q61961344',
 'Q7048977',
 'Q58415929',
 'Q1639378',
 'Q49848',
 'Q28877',
 'Q2424752',
 'Q131085629',
 'Q488383',
 'Q58778',
 'Q117208269',
 'Q12774177',
 'Q35825432',
 'Q115668308',
 'Q854457',
 'Q26907166',
 'Q17538423',
 'Q31464082',
 'Q16686448',
 'Q107435521',
 'Q47461344',
 'Q37866906',
 'Q17172633',
 'Q106559804',
 'Q28314507',
 'Q386724',
 'Q17537576',
 'Q119648442',
 'Q234460',
 'Q5127848',
 'Q35120',
 'Q17489659',
 'Q340169',
 'Q16334298',
 'Q1261026',
 'Q16889133',
 'Q103940464',
 'Q11033',
 'Q11474',
 'Q53617489',
 'Q53617407',
 'Q193395',
 'Q28555911',
 'Q2897903',
 'Q9158768',
 'Q6671777',
 'Q10683158',
 'Q251473',
 'Q99527517',
 'Q96791170',
 'Q337060',
 'Q8205328',
 'Q104450446',
 '

## WD type vs NER type

In [None]:
# For each WD type supplied by the user (WD_query_type, taken from the CEA ground truth) we retrieve the extended WD types up to the root.
# Given the list of extended types, we compute its overlap with the extended types returned by lamAPI for the HARD and SOFT queries.

False

In [25]:
# Example: full superclass chain (wdt:P279*) of Q12299841, returned as {superclass ID: label}.
retrieve_superclasses("Q12299841")

{'Q12299841': 'cricketer',
 'Q2066131': 'athlete',
 'Q18536342': 'competitive player',
 'Q50995749': 'sportsperson',
 'Q4197743': 'player',
 'Q215627': 'person',
 'Q830077': 'subject',
 'Q795052': 'individual',
 'Q3778211': 'legal person',
 'Q53617489': 'independent continuant',
 'Q7239': 'organism',
 'Q24229398': 'agent',
 'Q106559804': 'person or organization',
 'Q103940464': 'continuant',
 'Q223557': 'physical object',
 'Q66394244': 'physical anatomical entity',
 'Q4406616': 'concrete object',
 'Q53617407': 'material entity',
 'Q27043950': 'anatomical entity',
 'Q488383': 'object',
 'Q35120': 'entity'}

In [8]:
# Inspect the CTA mapping: key "<CTA col 0> <CTA col 1>" -> CTA col 2.
cta_values_dict

{'58891288_0_1117541047012405958 1': 'http://dbpedia.org/ontology/Film',
 '8468806_0_4382447409703007384 1': 'http://dbpedia.org/ontology/Lake',
 '50245608_0_871275842592178099 0': 'http://dbpedia.org/ontology/Film',
 '14067031_0_559833072073397908 1': 'http://dbpedia.org/ontology/Language',
 '8286121_0_8471791395229161598 0': 'http://dbpedia.org/ontology/Country',
 '39759273_0_1427898308030295194 1': 'http://dbpedia.org/ontology/Film',
 '14380604_4_3329235705746762392 1': 'http://dbpedia.org/ontology/Company',
 '20135078_0_7570343137119682530 3': 'http://dbpedia.org/ontology/Person',
 '29414811_6_8221428333921653560 1': 'http://dbpedia.org/ontology/VideoGame',
 '34041816_1_4749054164534706977 2': 'http://dbpedia.org/ontology/City',
 '14067031_0_559833072073397908 7': 'http://dbpedia.org/ontology/Currency',
 '71137051_0_8039724067857124984 0': 'http://dbpedia.org/ontology/Bird',
 '29414811_2_4773219892816395776 1': 'http://dbpedia.org/ontology/VideoGame',
 '99070098_0_20748727413026969

In [11]:
# Inspect the column-level CEA mapping: key "<CEA col 0> <CEA col 2>" -> CEA col 3 (last line seen per key).
cea_values_dict

{'50245608_0_871275842592178099 0': 'https://www.wikidata.org/entity/Q46551',
 '22864497_0_8632623712684511496 0': 'https://www.wikidata.org/entity/Q6738126',
 '66009064_0_9148652238372261251 0': 'https://www.wikidata.org/entity/Q1094988',
 '21362676_0_6854186738074119688 1': 'https://www.wikidata.org/entity/Q463832',
 '40534006_0_4617468856744635526 1': 'https://www.wikidata.org/entity/Q452590',
 '36102169_0_7739454799295072814 2': 'https://www.wikidata.org/entity/Q1978200',
 '53822652_0_5767892317858575530 1': 'https://www.wikidata.org/entity/Q200396',
 '60319454_0_3938426910282115527 0': 'https://www.wikidata.org/entity/Q69581',
 '8468806_0_4382447409703007384 1': 'https://www.wikidata.org/entity/Q1546823',
 '33401079_0_9127583903019856402 0': 'https://www.wikidata.org/entity/Q154538',
 '99070098_0_2074872741302696997 1': 'https://www.wikidata.org/entity/Q4920755',
 '50270082_0_444360818941411589 1': 'https://www.wikidata.org/entity/Q114468',
 '29414811_12_251152470253168163 1': 'ht