In [17]:
import os
import pandas as pd
from tqdm import tqdm
import logging

from SPARQLWrapper import SPARQLWrapper, JSON
import time

In [19]:
def retrieve_superclasses(entity_id, retries=3, delay=5):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    The query follows ``wdt:P279`` links with a ``*`` (zero-or-more) property
    path, so the starting entity itself is part of the result.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784"). It is
            interpolated into the SPARQL text as-is, so pass trusted IDs only.
        retries (int): Maximum number of attempts when the endpoint answers
            with HTTP 429 (Too Many Requests). Defaults to 3.
        delay (int | float): Seconds to wait between two attempts. Defaults to 5.

    Returns:
        dict: A dictionary where keys are superclass IDs, and values are their
        labels. An empty dictionary is returned when the query failed or
        produced nothing usable. A binding without a label falls back to the
        superclass ID as its label.
    """
    # Define the SPARQL endpoint and query
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?superclass ?superclassLabel WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=retries, delay=delay):
        """Run ``query`` and return the converted result, or None on failure."""
        for attempt in range(1, retries + 1):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                return sparql_client.query().convert()
            except Exception as e:
                # Prefer the HTTP status attached to the exception (``code``);
                # the message text is kept as a fallback for wrapped errors.
                rate_limited = getattr(e, "code", None) == 429 or "429" in str(e)
                if not rate_limited:
                    print(f"An error occurred: {e}")
                    break
                if attempt == retries:
                    # Nothing left to retry: do not sleep, and do not claim a retry.
                    print(f"Rate limit hit. No retries left. (Attempt {attempt}/{retries})")
                    break
                print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt}/{retries})")
                time.sleep(delay)
        return None

    # Set up the SPARQL client
    # NOTE(review): the Wikimedia User-Agent policy asks for a descriptive agent
    # string; consider passing one to SPARQLWrapper if requests get throttled.
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    if not results:
        print("Failed to retrieve data after multiple attempts.")
        return {}

    # Process results and return as a dictionary
    superclass_dict = {}
    for result in results.get("results", {}).get("bindings", []):
        superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
        # Tolerate a binding without a label instead of raising KeyError.
        label = result.get("superclassLabel", {}).get("value", superclass_id)
        superclass_dict[superclass_id] = label
    return superclass_dict


In [None]:

# Root folder of the Round1_T2D dataset. The default is the original local
# checkout; set the LAMAPI_DATASET_ROOT environment variable to point the
# notebook at a copy of the dataset stored elsewhere.
dataset_root = os.environ.get(
    "LAMAPI_DATASET_ROOT",
    "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D",
).rstrip("/")

tables_path = f"{dataset_root}/tables/"
cea_file = f"{dataset_root}/gt/CEA_Round1_gt_wd.csv"
cta_file = f"{dataset_root}/gt/CTA_Round1_gt.csv"

# Fail fast, with an explicit message, when the tables folder is missing.
if not os.path.isdir(tables_path):
    raise FileNotFoundError(f"Tables directory not found: {tables_path}")

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def _read_ground_truth(csv_path, id_column):
    """Read a header-less ground-truth CSV and add a "key" column.

    The key is column 0 joined by a single space with ``id_column`` cast to str.
    """
    frame = pd.read_csv(csv_path, header=None)
    frame["key"] = frame[0] + " " + frame[id_column].astype(str)
    return frame


# Read the cea_file and create a key-value dictionary (key -> column 3).
# dict(zip(...)) keeps the last row for every repeated key.
# NOTE(review): CEA keys are built from column 2 while CTA keys use column 1 —
# confirm both columns hold the same kind of index.
df_cea = _read_ground_truth(cea_file, id_column=2)
cea_values_dict = dict(zip(df_cea["key"].values, df_cea[3].values))

# Same for the cta_file (key -> column 2).
df_cta = _read_ground_truth(cta_file, id_column=1)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))

# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.keys()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values: key -> [CEA value, CTA value].
# Iterating the keys in sorted order makes the dictionary order reproducible;
# a set of strings is iterated in a different order on every kernel restart.
combined_dict = {key: [cea_values_dict[key], cta_values_dict[key]] for key in sorted(common_keys)}


In [15]:
# Each combined_dict value is a two-item list; only the last path segment of
# both entries is kept, giving an "ID -> type name" mapping. When an ID occurs
# more than once, the pair seen last wins.
WD_user = {
    entity_uri.rsplit('/', 1)[-1]: type_uri.rsplit('/', 1)[-1]
    for entity_uri, type_uri in combined_dict.values()
}

WD_user

{'Q663665': 'VideoGame',
 'Q4390981': 'Saint',
 'Q1781530': 'Museum',
 'Q25193623': 'Film',
 'Q459477': 'Company',
 'Q817': 'Country',
 'Q9669': 'PoliticalParty',
 'Q208875': 'Newspaper',
 'Q7626741': 'Cricketer',
 'Q831445': 'Company',
 'Q69581': 'Animal',
 'Q135465': 'Film',
 'Q575014': 'Film',
 'Q5285919': 'VideoGame',
 'Q5181893': 'Lake',
 'Q463832': 'Film',
 'Q276217': 'VideoGame',
 'Q1033': 'Country',
 'Q487907': 'Company',
 'Q104123': 'Film',
 'Q206319': 'Currency',
 'Q2384003': 'Bird',
 'Q114468': 'BaseballPlayer',
 'Q375648': 'BaseballPlayer',
 'Q1094988': 'BaseballPlayer',
 'Q83495': 'Film',
 'Q1546823': 'Lake',
 'Q1011': 'Country',
 'Q14429': 'Lake',
 'Q154538': 'Person',
 'Q72538': 'Scientist',
 'Q227': 'Country',
 'Q214801': 'Film',
 'Q725552': 'Film',
 'Q200396': 'Film',
 'Q178021': 'Airport',
 'Q213911': 'VideoGame',
 'Q46551': 'Film',
 'Q452590': 'Wrestler',
 'Q639481': 'Film',
 'Q26': 'AdministrativeRegion',
 'Q61620': 'Saint',
 'Q30315633': 'Newspaper',
 'Q6738126': '

In [None]:
# For each WD type supplied by the user (WD_user, taken from CEA), retrieve the extended WD types up to the root.
# Given that list of ext_types, compute its overlap with the ext_types obtained from lamAPI via the HARD and SOFT queries.

False

In [25]:
# Spot-check the helper on a single entity; the recorded output labels Q12299841 as 'cricketer'.
retrieve_superclasses("Q12299841")

{'Q12299841': 'cricketer',
 'Q2066131': 'athlete',
 'Q18536342': 'competitive player',
 'Q50995749': 'sportsperson',
 'Q4197743': 'player',
 'Q215627': 'person',
 'Q830077': 'subject',
 'Q795052': 'individual',
 'Q3778211': 'legal person',
 'Q53617489': 'independent continuant',
 'Q7239': 'organism',
 'Q24229398': 'agent',
 'Q106559804': 'person or organization',
 'Q103940464': 'continuant',
 'Q223557': 'physical object',
 'Q66394244': 'physical anatomical entity',
 'Q4406616': 'concrete object',
 'Q53617407': 'material entity',
 'Q27043950': 'anatomical entity',
 'Q488383': 'object',
 'Q35120': 'entity'}

In [8]:
# Inspect the CTA lookup; in the recorded output the values are DBpedia ontology URIs.
cta_values_dict

{'58891288_0_1117541047012405958 1': 'http://dbpedia.org/ontology/Film',
 '8468806_0_4382447409703007384 1': 'http://dbpedia.org/ontology/Lake',
 '50245608_0_871275842592178099 0': 'http://dbpedia.org/ontology/Film',
 '14067031_0_559833072073397908 1': 'http://dbpedia.org/ontology/Language',
 '8286121_0_8471791395229161598 0': 'http://dbpedia.org/ontology/Country',
 '39759273_0_1427898308030295194 1': 'http://dbpedia.org/ontology/Film',
 '14380604_4_3329235705746762392 1': 'http://dbpedia.org/ontology/Company',
 '20135078_0_7570343137119682530 3': 'http://dbpedia.org/ontology/Person',
 '29414811_6_8221428333921653560 1': 'http://dbpedia.org/ontology/VideoGame',
 '34041816_1_4749054164534706977 2': 'http://dbpedia.org/ontology/City',
 '14067031_0_559833072073397908 7': 'http://dbpedia.org/ontology/Currency',
 '71137051_0_8039724067857124984 0': 'http://dbpedia.org/ontology/Bird',
 '29414811_2_4773219892816395776 1': 'http://dbpedia.org/ontology/VideoGame',
 '99070098_0_20748727413026969

In [11]:
# Inspect the CEA lookup; in the recorded output the values are Wikidata entity URIs.
cea_values_dict

{'50245608_0_871275842592178099 0': 'https://www.wikidata.org/entity/Q46551',
 '22864497_0_8632623712684511496 0': 'https://www.wikidata.org/entity/Q6738126',
 '66009064_0_9148652238372261251 0': 'https://www.wikidata.org/entity/Q1094988',
 '21362676_0_6854186738074119688 1': 'https://www.wikidata.org/entity/Q463832',
 '40534006_0_4617468856744635526 1': 'https://www.wikidata.org/entity/Q452590',
 '36102169_0_7739454799295072814 2': 'https://www.wikidata.org/entity/Q1978200',
 '53822652_0_5767892317858575530 1': 'https://www.wikidata.org/entity/Q200396',
 '60319454_0_3938426910282115527 0': 'https://www.wikidata.org/entity/Q69581',
 '8468806_0_4382447409703007384 1': 'https://www.wikidata.org/entity/Q1546823',
 '33401079_0_9127583903019856402 0': 'https://www.wikidata.org/entity/Q154538',
 '99070098_0_2074872741302696997 1': 'https://www.wikidata.org/entity/Q4920755',
 '50270082_0_444360818941411589 1': 'https://www.wikidata.org/entity/Q114468',
 '29414811_12_251152470253168163 1': 'ht