In [2]:
import json
import random
import os
import pandas as pd
import re
import aiohttp
import asyncio
import backoff
import nest_asyncio
import time
from SPARQLWrapper import SPARQLWrapper, JSON
from requests import get
import numpy as np
import requests
from aiohttp import ClientResponseError
import logging
from tqdm import tqdm


In [3]:
### in case you want HT2

tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/gt/cea.csv'
os.listdir(tables_path)
# Initialize logging
#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype(str) + " " + df[2].astype(str)
cea_values_dict = dict(zip(df["key"].values, df[3].values))
cea_keys_set = set(df["key"].values)

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        local_key_to_cell = {}
        
        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    local_key_to_cell[key] = (cell_value, cea_values_dict[key])
                    break  # Exit inner loop early as only one match per row/col is needed
        
        return local_key_to_cell
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

pattern = r'^\.'

# Create a list of file paths, excluding files that start with a dot
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path) if not re.match(pattern, table)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)



with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/work/HT2_ner_type.json', 'r') as f:
    ner_type = json.load(f)

Processing tables: 100%|██████████| 1750/1750 [00:18<00:00, 93.90it/s] 


In [4]:
sample_size = 4000
key_to_cell_sample = dict(random.sample(list(key_to_cell.items()), sample_size))

In [5]:
def get_query(name, value):
    name = str(name).replace('"', ' ')
    if value is not None:
        ### SOFT FILTERING CONSTRAINT
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query' : f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0}}}}}}], "should": [{{"term": {{"NERtype": "{value}"}}}}]}}}}}}',
            'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
        }

    else:
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0}}}}}}]}}}}}}''',
            'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
        }
    return params


queries = []
for key in tqdm(key_to_cell_sample):
    id_table, _, id_col = key.split(" ")
    name = key_to_cell[key][0]
    q_id = key_to_cell[key][1]
    new_key = f"{id_table} {id_col}"
    if new_key in ner_type:
        NER_type = ner_type[new_key]
        query = get_query(name, NER_type)
        match = re.search(r'Q(\d+)$', q_id)
        if match:
            data = json.loads(query['query'])
            ner_type_list = data['query']['bool']['should'][0]['term']['NERtype']
            queries.append((query, match[0], ner_type_list))



  0%|          | 0/4000 [00:00<?, ?it/s]

100%|██████████| 4000/4000 [00:00<00:00, 55612.44it/s]


In [None]:
def get_query(name, value):    
    name = name.replace('"', ' ')

    if value is not None:
        ### HARD FILTERING CONSTRAINT
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0}}}}}}, {{"term": {{"NERtype": "{value}"}}}}]}}}}}}',
            'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
        }

    else:
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0}}}}}}]}}}}}}''',
            'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
        }
    return params


queries = []
for key in tqdm(key_to_cell_sample):
    id_table, _, id_col = key.split(" ")
    name = key_to_cell[key][0]
    q_id = key_to_cell[key][1]
    new_key = f"{id_table} {id_col}"
    if new_key in ner_type:
        NER_type = ner_type[new_key]
        query = get_query(name, NER_type)
        match = re.search(r'Q(\d+)$', q_id)
        if match:
            data = json.loads(query['query'])
            # Navigate through the dictionary to extract the value of "NERtype"
            ner_type_list = data['query']['bool']['must'][1]['term']['NERtype']
            queries.append((query, match[0],ner_type_list))


In [6]:
import aiohttp
import asyncio
import backoff
import nest_asyncio
import random
from tqdm import tqdm
import numpy as np

# Assume queries is a list of tuples [(param1, id1), (param2, id2), ...]

failed_queries = {}
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=10, 
    max_time=400
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        # Convert all params to str, int, or float
        #params = {k: (int(v) if isinstance(v, np.integer) else str(v)) for k, v in params.items()}
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, url, id, headers, params, semaphore, pbar):
    try:
        data = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{id}'")
            asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            if id == item.get('id'):
                asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
                pos_score = item.get('pos_score', 0)
                if pos_score:
                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                else:
                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                return mrr_increment, 1

    return 0, 0

async def main(queries, url, pbar, failed_queries):
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0

    async with aiohttp.ClientSession() as session:
        tasks = []
        for param, id, _ in queries:
            tasks.append(process_item(session, url, id, headers, param, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (param, id, NERtype) in zip(results, queries):
            if mrr_increment == 0 and count == 0:
                failed_queries[id] = NERtype
            else:
                m_mrr += mrr_increment
                cont_el += count

        asyncio.get_event_loop().call_soon_threadsafe(pbar.close)

    print(f"Coverage of 2T: {cont_el / len(queries)}")
    print(f"Measure Reciprocal Rank of 2T: {m_mrr / len(queries)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(queries))
        asyncio.run(main(queries, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(queries, url, pbar, failed_queries))


 90%|█████████ | 3619/4000 [12:56<06:19,  1.00it/s]

TimeoutError: 

In [None]:
with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/HT2_failed_queries_SOFT.json', 'w') as json_file:
    json.dump(failed_queries, json_file, indent=4)

In [2]:
####################
# READ THE JSON
#####################

json_file_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Round1_T2D_f3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R1_sorted_mentions = json.load(file)


# SPLIT OVER THE QUARTILES

n = len(R1_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R1_sorted_mentions[:q1_idx]
q2 = R1_sorted_mentions[q1_idx:q2_idx]
q3 = R1_sorted_mentions[q2_idx:q3_idx]
q4 = R1_sorted_mentions[q3_idx:]

sample_size = 1000
R1_sample_keys = []
R1_sample_keys = R1_sample_keys + random.sample(q1, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q2, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q3, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q4, sample_size)

q_ids = {item[1]['name']: item[1]['id'] for item in R1_sample_keys}

In [4]:
# find the mention in the table
tables = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D/tables/"
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D/gt/CTA_Round1_gt.csv'
os.listdir(tables)

mapping = {
    "LOC": [
        "Place", "PopulatedPlace", "City", "Country", "Region", "Mountain", "Island", "Lake", "River",
        "Park", "Building", "HistoricPlace", "Monument", "Bridge", "Road", "Airport"
    ],
    "PERS": [
        "Person", "Artist", "Athlete", "Politician", "Scientist", "Writer", "Actor", "Musician", "MilitaryPerson",
        "Religious", "Royalty", "Criminal"
    ],
    "ORG": [
        "Organisation", "Company", "EducationalInstitution", "PoliticalParty", "SportsTeam", "Non-ProfitOrganisation",
        "GovernmentAgency", "ReligiousOrganisation", "Band", "Library", "Museum", "Hospital", "University", "TradeUnion"
    ]
}

# Create reverse mapping
reverse_mapping = {v: k for k, values in mapping.items() for v in values}

# Define function to map df[2] values to their categories
def map_class_to_category(class_name):
    return reverse_mapping.get(class_name, "OTHERS")

# Apply the function and create the 'key' column
cta_keys = {}
df = pd.read_csv(cta_file, header=None)
type = df[2].astype(str).str.split('/').str[-1]
df["category"] = type.apply(map_class_to_category)
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                key_to_cell[key] = tmp_value

100%|██████████| 64/64 [00:02<00:00, 30.33it/s]


In [6]:
def get_keys_from_value(d, value):
    keys = [key for key, val in d.items() if val == value]
    return keys[0]

In [7]:
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

count = 0
for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        key = f"{row['table_name']} {row['col']}"
        if key in key_to_cell.keys() and row["url"] in q_ids.values():
            count += 1
            data = key_to_cell[key]
            mentions[get_keys_from_value(q_ids, row["url"])] = (row["url"], data)

print("Processing complete.")

  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:04<00:00,  2.17it/s]

Processing complete.





In [10]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
failed_queries = {}

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []
async def process_item(session, name, value, url, headers, semaphore, pbar):
    ### SOFT FILTERING CONTSTRAINT
    params = {
        'name': name,
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'''
            {{
                "query": {{
                    "bool": {{
                        "must": [
                            {{
                                "match": {{
                                    "name": {{
                                        "query": "{name}",
                                        "boost": 2.0
                                    }}
                                }}
                            }}
                        ],
                        "should": [
                            {{
                                "term": {{
                                    "NERtype": "{value[1]}"
                                }}
                            }}
                        ]
                    }}
                }}
            }}
            ''',
        'sort': [
            f'''{{"popularity": {{"order": "desc"}}}}'''
        ]
    }



    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{name}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        pbar.update(1)  # No need to await here
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', value[0])

            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1

        print(f"{name}: {GT_id_match[0]} NOT FOUND in {value[1]}")
        print("___________________________")

    return 0, 0

async def main(mentions, url, pbar, failed_queries):
    string_name_list = mentions
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for name, type in string_name_list.items():
            tasks.append(process_item(session, name, type, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:                
                failed_queries[url_id] = name
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(mentions)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(mentions)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(mentions))
        asyncio.run(main(mentions, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(mentions, url, pbar))


  1%|          | 48/3865 [00:37<31:35,  2.01it/s]  

st helena pounds: Q374453 NOT FOUND in OTHERS
___________________________


  2%|▏         | 63/3865 [00:45<29:30,  2.15it/s]

abraham the syrian: Q1292819 NOT FOUND in OTHERS
___________________________


  5%|▍         | 189/3865 [01:50<12:25,  4.93it/s]  

queensland, australia: Q36074 NOT FOUND in OTHERS
___________________________


  6%|▌         | 226/3865 [02:02<17:14,  3.52it/s]

darksiders: Q30525856 NOT FOUND in OTHERS
___________________________


 10%|█         | 393/3865 [02:36<11:20,  5.10it/s]

icarus, international journal of solar system studies: Q1656088 NOT FOUND in OTHERS
___________________________


 13%|█▎        | 501/3865 [02:53<13:50,  4.05it/s]

eastern hognose snake: Q2699564 NOT FOUND in OTHERS
___________________________


 16%|█▌        | 627/3865 [03:14<05:49,  9.26it/s]

dijibouti: Q977 NOT FOUND in LOC
___________________________


 19%|█▉        | 742/3865 [03:32<07:15,  7.17it/s]

swee waxbill: Q27075727 NOT FOUND in OTHERS
___________________________


 24%|██▍       | 924/3865 [04:05<10:17,  4.76it/s]

dusky canada goose: Q27600982 NOT FOUND in OTHERS
___________________________


 25%|██▍       | 952/3865 [04:08<05:37,  8.62it/s]

carmona retusa: Q15283553 NOT FOUND in OTHERS
___________________________


 27%|██▋       | 1034/3865 [04:23<10:47,  4.38it/s]

waterhousea floribunda: Q108742313 NOT FOUND in OTHERS
___________________________


 27%|██▋       | 1041/3865 [04:24<05:44,  8.20it/s]

oldsquaw: Q26597 NOT FOUND in OTHERS
___________________________


 28%|██▊       | 1100/3865 [04:33<05:21,  8.60it/s]

northern short-tailed shrew: Q1766543 NOT FOUND in OTHERS
___________________________


 33%|███▎      | 1272/3865 [04:59<03:52, 11.14it/s]

ireland: Q27 NOT FOUND in OTHERS
___________________________


 38%|███▊      | 1466/3865 [05:30<06:05,  6.56it/s]

jeff leonard: Q3176709 NOT FOUND in OTHERS
___________________________


 40%|████      | 1555/3865 [05:46<06:03,  6.35it/s]

bill hutchinson: Q4909545 NOT FOUND in OTHERS
___________________________


 41%|████▏     | 1596/3865 [05:52<05:22,  7.03it/s]

eggert lake: Q491288 NOT FOUND in LOC
___________________________


 43%|████▎     | 1657/3865 [06:03<06:29,  5.67it/s]

achnatherum speciosum: Q15507171 NOT FOUND in OTHERS
___________________________


 43%|████▎     | 1660/3865 [06:04<06:21,  5.78it/s]

cyril, apostle to the slavs: Q239925 NOT FOUND in OTHERS
___________________________


 43%|████▎     | 1675/3865 [06:06<05:29,  6.65it/s]

african citril: Q27075797 NOT FOUND in OTHERS
___________________________


 46%|████▌     | 1771/3865 [06:22<03:12, 10.89it/s]

raven squad: Q5237049 NOT FOUND in OTHERS
___________________________


 47%|████▋     | 1814/3865 [06:30<06:06,  5.60it/s]

green wood-hoopoe: Q811583 NOT FOUND in OTHERS
___________________________


 47%|████▋     | 1819/3865 [06:30<04:10,  8.17it/s]

lac a l'eau-claire: Q1699973 NOT FOUND in LOC
___________________________


 48%|████▊     | 1844/3865 [06:34<04:14,  7.93it/s]

pale flycatcher: Q1318056 NOT FOUND in OTHERS
___________________________


 50%|█████     | 1949/3865 [06:53<05:49,  5.49it/s]

oplismenus burmannii: Q13936835 NOT FOUND in OTHERS
___________________________


 52%|█████▏    | 2017/3865 [07:05<04:14,  7.25it/s]

carex heliophila: Q2938524 NOT FOUND in OTHERS
___________________________


 53%|█████▎    | 2036/3865 [07:08<05:09,  5.92it/s]

lake james, north carolina: Q1485910 NOT FOUND in LOC
___________________________


 58%|█████▊    | 2233/3865 [07:41<03:25,  7.94it/s]

new south wales, australia: Q3224 NOT FOUND in OTHERS
___________________________


 60%|██████    | 2336/3865 [07:56<03:25,  7.42it/s]

eastern screech-owl: Q251939 NOT FOUND in OTHERS
___________________________


 62%|██████▏   | 2401/3865 [08:06<03:02,  8.03it/s]

egyptian rousette: Q754983 NOT FOUND in OTHERS
___________________________


 64%|██████▎   | 2457/3865 [08:15<05:26,  4.31it/s]

bahamas, the: Q778 NOT FOUND in LOC
___________________________


 67%|██████▋   | 2594/3865 [08:38<03:26,  6.17it/s]

eagle owl: Q214293 NOT FOUND in OTHERS
___________________________


 67%|██████▋   | 2599/3865 [08:38<02:48,  7.51it/s]

koninklijk museum van het leger en de krijgsgeschiedenis: Q1395176 NOT FOUND in ORG
___________________________


 67%|██████▋   | 2603/3865 [08:39<03:40,  5.73it/s]

african pied hornbill: Q226128 NOT FOUND in OTHERS
___________________________


 69%|██████▉   | 2681/3865 [08:53<02:02,  9.67it/s]

lac st-jean: Q979922 NOT FOUND in LOC
___________________________


 70%|███████   | 2713/3865 [08:57<02:13,  8.62it/s]

southern water snake: Q2065834 NOT FOUND in OTHERS
___________________________


 73%|███████▎  | 2827/3865 [09:16<03:20,  5.19it/s]

haronga madagascariensis: Q5194906 NOT FOUND in OTHERS
___________________________


 76%|███████▋  | 2948/3865 [09:36<02:03,  7.40it/s]

white gopher snake: Q1663774 NOT FOUND in OTHERS
___________________________


 77%|███████▋  | 2968/3865 [09:38<02:05,  7.16it/s]

african pygmy-goose: Q386949 NOT FOUND in OTHERS
___________________________


 78%|███████▊  | 3025/3865 [09:47<01:51,  7.50it/s]

brown snake-eagle: Q549126 NOT FOUND in OTHERS
___________________________


 80%|███████▉  | 3078/3865 [09:56<02:15,  5.83it/s]

charley root: Q5085471 NOT FOUND in OTHERS
___________________________


 80%|████████  | 3095/3865 [09:59<01:49,  7.04it/s]

little greenbul: Q25249305 NOT FOUND in OTHERS
___________________________


 83%|████████▎ | 3208/3865 [10:16<01:31,  7.16it/s]

crawfish frog: Q26849093 NOT FOUND in OTHERS
___________________________


 84%|████████▎ | 3231/3865 [10:19<01:21,  7.83it/s]

bud man: Q125074 NOT FOUND in ORG
___________________________


 85%|████████▌ | 3298/3865 [10:29<01:09,  8.15it/s]

banded martin: Q1589557 NOT FOUND in OTHERS
___________________________


 86%|████████▌ | 3312/3865 [10:31<01:52,  4.90it/s]

african scops-owl: Q1270188 NOT FOUND in OTHERS
___________________________


 90%|████████▉ | 3463/3865 [10:53<00:37, 10.62it/s]

southern flying squirrel: Q913350 NOT FOUND in OTHERS
___________________________


 90%|█████████ | 3491/3865 [10:58<00:45,  8.17it/s]

black cuckoo-shrike: Q1306580 NOT FOUND in OTHERS
___________________________


 91%|█████████ | 3524/3865 [11:04<01:06,  5.14it/s]

schedonorus arundinaceus: Q157922 NOT FOUND in OTHERS
___________________________


 92%|█████████▏| 3562/3865 [11:10<00:41,  7.38it/s]

springhare: Q3027980 NOT FOUND in OTHERS
___________________________


 92%|█████████▏| 3574/3865 [11:12<00:56,  5.14it/s]

sharpe's starling: Q27075613 NOT FOUND in OTHERS
___________________________


 93%|█████████▎| 3594/3865 [11:16<00:45,  5.91it/s]

worm snake: Q2940133 NOT FOUND in OTHERS
___________________________


 97%|█████████▋| 3730/3865 [11:37<00:17,  7.67it/s]

pickerel frog: Q28035920 NOT FOUND in OTHERS
___________________________


 97%|█████████▋| 3749/3865 [11:40<00:15,  7.25it/s]

baystones: Q7968079 NOT FOUND in LOC
___________________________


 97%|█████████▋| 3761/3865 [11:43<00:24,  4.20it/s]

yellow-spotted barbet: Q5734788 NOT FOUND in OTHERS
___________________________


 98%|█████████▊| 3785/3865 [11:46<00:11,  7.11it/s]

southeastern five-lined skink: Q137678 NOT FOUND in OTHERS
___________________________


 99%|█████████▉| 3839/3865 [11:55<00:03,  6.72it/s]

matrix, the: Q83495 NOT FOUND in OTHERS
___________________________


100%|█████████▉| 3862/3865 [11:58<00:00,  5.38it/s]

Coverage of R1: 0.9844760672703752
Measure Reciprocal Rank of R1: 0.9581293661060406





In [50]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
failed_queries = {}

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []
async def process_item(session, name, value, url, headers, semaphore, pbar):

    ### HARD FILTERING CONTSTRAINT
    params = {
        'name': name,
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'''
            {{
                "query": {{
                    "bool": {{
                        "must": [
                            {{
                                "match": {{
                                    "name": {{
                                        "query": "{name}",
                                        "boost": 2.0
                                    }}
                                }}
                            }},
                            {{
                                "term": {{
                                    "NERtype": "{value[1]}"
                                }}
                            }}
                        ]
                    }}
                }}
            }}
            ''',
        'sort': [
            f'''{{"popularity": {{"order": "desc"}}}}'''
        ]
    }


    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{name}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        pbar.update(1)  # No need to await here
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', value[0])

            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1

        print(f"{name}: {GT_id_match[0]} NOT FOUND in {value[1]}")
        print("___________________________")

    return 0, 0

async def main(mentions, url, pbar, failed_queries):
    string_name_list = mentions
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for name, type in string_name_list.items():
            tasks.append(process_item(session, name, type, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:             
                failed_queries[url_id] = name


            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(mentions)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(mentions)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(mentions))
        asyncio.run(main(mentions, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(mentions, url, pbar))


100%|██████████| 4000/4000 [00:00<00:00, 5060.35it/s]


In [105]:
import aiohttp
import asyncio
import backoff
import nest_asyncio
import random
from tqdm import tqdm
import numpy as np

# Assume queries is a list of tuples [(param1, id1), (param2, id2), ...]

failed_queries = {}
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=10, 
    max_time=400
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        # Convert all params to str, int, or float
        params = {k: (int(v) if isinstance(v, np.integer) else str(v)) for k, v in params.items()}
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, url, id, headers, params, semaphore, pbar):
    try:
        data = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{id}'")
            asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            if id == item.get('id'):
                asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
                pos_score = item.get('pos_score', 0)
                if pos_score:
                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                else:
                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                return mrr_increment, 1

    return 0, 0

async def main(queries, url, pbar, failed_queries):
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0

    async with aiohttp.ClientSession() as session:
        tasks = []
        for param, id in queries:
            tasks.append(process_item(session, url, id, headers, param, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (param, id) in zip(results, queries):
            if mrr_increment == 0 and count == 0:
                failed_queries[id] = param
            else:
                m_mrr += mrr_increment
                cont_el += count

        asyncio.get_event_loop().call_soon_threadsafe(pbar.close)

    print(f"Coverage of 2T: {cont_el / len(queries)}")
    print(f"Measure Reciprocal Rank of 2T: {m_mrr / len(queries)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(queries))
        asyncio.run(main(queries, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(queries, url, pbar, failed_queries))


 12%|█▏        | 463/4000 [06:03<46:13,  1.28it/s]
  1%|          | 45/4000 [00:30<39:26,  1.67it/s]  2024-08-01 14:41:40,512 - INFO - Backing off fetch(...) for 0.0s (asyncio.exceptions.TimeoutError)
2024-08-01 14:41:40,514 - INFO - Backing off fetch(...) for 0.2s (asyncio.exceptions.TimeoutError)
2024-08-01 14:41:40,515 - INFO - Backing off fetch(...) for 0.9s (asyncio.exceptions.TimeoutError)
  2%|▏         | 99/4000 [00:52<26:48,  2.42it/s]  2024-08-01 14:42:02,516 - INFO - Backing off fetch(...) for 0.9s (asyncio.exceptions.TimeoutError)
  3%|▎         | 103/4000 [00:54<28:58,  2.24it/s]2024-08-01 14:42:04,495 - INFO - Backing off fetch(...) for 0.9s (asyncio.exceptions.TimeoutError)
2024-08-01 14:42:04,495 - INFO - Backing off fetch(...) for 0.9s (asyncio.exceptions.TimeoutError)
2024-08-01 14:42:04,498 - INFO - Backing off fetch(...) for 0.1s (asyncio.exceptions.TimeoutError)
  5%|▌         | 215/4000 [01:39<18:40,  3.38it/s]  2024-08-01 14:42:49,510 - INFO - Backing off fetch(.

In [52]:
with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/HT2_failed_queries_SOFT.json', 'w') as json_file:
    json.dump(failed_queries, json_file, indent=4)

In [53]:
with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/2T_failed_queries_HARD.json', 'r') as f:
    failed_queries_hard = json.load(f)

with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/2T_failed_queries_SOFT.json', 'r') as f:
    failed_queries_soft = json.load(f)

In [58]:
len(set(failed_queries_hard.keys()) & set(failed_queries_soft.keys()))

581

In [55]:
print(f"failed_queries_hard: {len(failed_queries_hard)} vs failed_queries_soft: {len(failed_queries_soft)}")

failed_queries_hard: 590 vs failed_queries_soft: 587
