In [1]:
! pip install backoff

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import random
import json
import requests
from tqdm import tqdm
from pprint import pprint
import re
import nest_asyncio
import asyncio
import aiohttp
import backoff
from tqdm.asyncio import tqdm
from aiohttp.client_exceptions import ClientResponseError  # Add this import

# HardTableR3

In [3]:
GT_file = './data/dataset_GT/HardTableR3-2021_f3.csv'

chunk_size = 1000  # Adjust this based on your memory constraints
ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

100%|██████████| 999/999 [00:30<00:00, 33.10it/s]

Processing complete.





In [4]:
# find the mention in the table
tables = "./data/Dataset/Dataset/HardTablesR3/tables/"
cea_file = './data/Dataset/Dataset/HardTablesR3/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    pattern = r'^\.'
    if re.match(pattern, table):
        continue
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

100%|██████████| 10779/10779 [03:06<00:00, 57.67it/s]  


In [5]:
cea_file = './data/Dataset/Dataset/HardTablesR3/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

100%|██████████| 59/59 [00:07<00:00,  8.04it/s]

Processing complete.





In [6]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/HardTablesR3_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

Sorted mentions saved to ./data/HardTablesR3_sorted_mentions.json


In [3]:
####################
# READ THE JSON
#####################

json_file_path = "./data/HardTablesR3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    HT3_sorted_mentions = json.load(file)

## Sample extraction

In [11]:
# SPLIT OVER THE QUARTILES

n = len(HT3_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = HT3_sorted_mentions[:q1_idx]
q2 = HT3_sorted_mentions[q1_idx:q2_idx]
q3 = HT3_sorted_mentions[q2_idx:q3_idx]
q4 = HT3_sorted_mentions[q3_idx:]
    

In [12]:
sample_size = 1000
HT3_sample_keys = []
HT3_sample_keys = HT3_sample_keys + random.sample(q1, sample_size)
HT3_sample_keys = HT3_sample_keys + random.sample(q2, sample_size)
HT3_sample_keys = HT3_sample_keys + random.sample(q3, sample_size)
HT3_sample_keys = HT3_sample_keys + random.sample(q4, sample_size)

## Coverage measure

In [14]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=50) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}',
        'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(HT3_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in HT3_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id)[0]
                
                response = requests.get(url, params)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of HardTableR3: {cont_el / len(HT3_sample_keys)}")
    print(f"Measure Reciprocal Rank of HardTableR3: {m_mrr / len(HT3_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(HT3_sample_keys))
        asyncio.run(main(HT3_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(HT3_sample_keys, url, pbar))



  0%|          | 0/4000 [00:00<?, ?it/s][A
  0%|          | 1/4000 [00:05<5:42:31,  5.14s/it][A
  0%|          | 4/4000 [00:05<1:08:33,  1.03s/it][A
  0%|          | 10/4000 [00:05<21:42,  3.06it/s] [A
  0%|          | 13/4000 [00:05<16:51,  3.94it/s][A
  0%|          | 15/4000 [00:06<16:03,  4.14it/s][A
  0%|          | 17/4000 [00:06<15:54,  4.17it/s][A
  0%|          | 19/4000 [00:06<14:02,  4.72it/s][A
  0%|          | 20/4000 [00:07<13:29,  4.92it/s][A
  1%|          | 21/4000 [00:07<14:28,  4.58it/s][A
  1%|          | 22/4000 [00:07<16:52,  3.93it/s][A
  1%|          | 24/4000 [00:08<14:00,  4.73it/s][A
  1%|          | 26/4000 [00:08<11:14,  5.89it/s][A
  1%|          | 28/4000 [00:08<09:13,  7.18it/s][A
  1%|          | 30/4000 [00:08<08:05,  8.17it/s][A
  1%|          | 31/4000 [00:08<08:04,  8.18it/s][A
  1%|          | 34/4000 [00:09<07:49,  8.45it/s][A
  1%|          | 36/4000 [00:09<07:23,  8.93it/s][A
  1%|          | 37/4000 [00:09<07:24,  8.91it/s][

Coverage of HardTableR3: 0.972
Measure Reciprocal Rank of HardTableR3: 0.9101919999999943





# HardTableR2

In [2]:
GT_file = './data/dataset_GT/HardTableR2-2021_f3.csv'

chunk_size = 1000  # Adjust this based on your memory constraints
ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

100%|██████████| 851/851 [00:25<00:00, 33.94it/s]

Processing complete.





In [3]:
# find the mention in the table
tables = "./data/Dataset/Dataset/HardTablesR2/tables/"
cea_file = './data/Dataset/Dataset/HardTablesR2/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    pattern = r'^\.'
    if re.match(pattern, table):
        continue
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

100%|██████████| 2692/2692 [00:39<00:00, 68.46it/s] 


In [4]:
cea_file = './data/Dataset/Dataset/HardTablesR2/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

100%|██████████| 48/48 [00:06<00:00,  7.34it/s]

Processing complete.





In [3]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/HardTablesR2_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

NameError: name 'mentions' is not defined

In [4]:
####################
# READ THE JSON
#####################

json_file_path = "./data/HardTablesR2_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    HT2_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in HT2_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in HT2_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


## Sample extraction

In [7]:
# SPLIT OVER THE QUARTILES

n = len(HT2_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = HT2_sorted_mentions[:q1_idx]
q2 = HT2_sorted_mentions[q1_idx:q2_idx]
q3 = HT2_sorted_mentions[q2_idx:q3_idx]
q4 = HT2_sorted_mentions[q3_idx:]
    

In [8]:
sample_size = 1000
HT2_sample_keys = []
HT2_sample_keys = HT2_sample_keys + random.sample(q1, sample_size)
HT2_sample_keys = HT2_sample_keys + random.sample(q2, sample_size)
HT2_sample_keys = HT2_sample_keys + random.sample(q3, sample_size)
HT2_sample_keys = HT2_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in HT2_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in HT2_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [11]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=50) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}',
        'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R1_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in HT2_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id)[0]
                
                response = requests.get(url, params)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of HardTableR2: {cont_el / len(HT2_sample_keys)}")
    print(f"Measure Reciprocal Rank of HardTableR2: {m_mrr / len(HT2_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(HT2_sample_keys))
        asyncio.run(main(HT2_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(HT2_sample_keys, url, pbar))


 99%|█████████▉| 3965/4000 [12:44<00:06,  5.19it/s]

Coverage of HardTableR2: 0.99025
Measure Reciprocal Rank of HardTableR2: 0.9123277499999568





## query
Coverage of HardTableR2: 0.99

Measure Reciprocal Rank of HardTableR2: 0.912763999999958

In [None]:
import aiohttp
import asyncio
import backoff
from tqdm.asyncio import tqdm  # Import the asynchronous version of tqdm
import re
import nest_asyncio

url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, name, value, url, headers, semaphore, pbar):
    params = {
        'name': name,
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'''
            {{
                "query": {{
                    "bool": {{
                        "must": [
                            {{
                                "match": {{
                                    "name": {{
                                        "query": "{name}",
                                        "boost": 2.0
                                    }}
                                }}
                            }}
                        ],
                        "must": [
                            {{
                                "term": {{
                                    "NERtype": "{value[1]}"
                                }}
                            }}
                        ]
                    }}
                }}
            }}
            ''',
        'sort': [
            f'''{{"popularity": {{"order": "desc"}}}}'''
        ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{name}'")
            await pbar.update(1)  # Await the progress bar update
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', value[0])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    await pbar.update(1)  # Await the progress bar update
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1

    return 0, 0

async def main(mentions, url):
    string_name_list = mentions
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        async with tqdm(total=len(string_name_list)) as pbar:
            for name, type in string_name_list.items():
                tasks.append(process_item(session, name, type, url, headers, semaphore, pbar))
            
            results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id[0])[0]
                
                response = requests.get(url, params=params, headers=headers)
                if response.status_code == 200:
                    data = response.json()
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                await pbar.update(1)  # Await the progress bar update
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

    print(f"Coverage of R1: {cont_el / len(mentions)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(mentions)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(mentions))
        asyncio.run(main(mentions, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(mentions, url, pbar))



# Round1_T2D_f3

In [21]:
GT_file = './data/dataset_GT/Round1_T2D_f3.csv'

chunk_size = 1000  # Adjust this based on your memory constraints
ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

100%|██████████| 158/158 [00:03<00:00, 41.82it/s]

Processing complete.





In [3]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round1_T2D/tables/"
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

100%|██████████| 64/64 [00:01<00:00, 48.95it/s]


In [23]:
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

100%|██████████| 9/9 [00:01<00:00,  5.80it/s]

Processing complete.





In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [18]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R1_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R1_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R1_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [4]:
# SPLIT OVER THE QUARTILES

n = len(R1_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R1_sorted_mentions[:q1_idx]
q2 = R1_sorted_mentions[q1_idx:q2_idx]
q3 = R1_sorted_mentions[q2_idx:q3_idx]
q4 = R1_sorted_mentions[q3_idx:]
    

In [5]:
sample_size = 1000
R1_sample_keys = []
R1_sample_keys = R1_sample_keys + random.sample(q1, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q2, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q3, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R1_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R1_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [7]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [None]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}',
        'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R1_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R1_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id)[0]
                
                response = requests.get(url, params)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(R1_sample_keys)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R1_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R1_sample_keys))
        asyncio.run(main(R1_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R1_sample_keys, url, pbar))


### query
Coverage of R1: 0.962

Measure Reciprocal Rank of R1: 0.94661674999995793

# Round3_2019

In [None]:
GT_file = './data/dataset_GT/Round3_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round3_2019/tables/"
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [16]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R3_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R3_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R3_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [17]:
# SPLIT OVER THE QUARTILES

n = len(R3_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R3_sorted_mentions[:q1_idx]
q2 = R3_sorted_mentions[q1_idx:q2_idx]
q3 = R3_sorted_mentions[q2_idx:q3_idx]
q4 = R3_sorted_mentions[q3_idx:]
    

In [18]:
sample_size = 1000
R3_sample_keys = []
R3_sample_keys = R3_sample_keys + random.sample(q1, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q2, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q3, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R3_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R3_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [None]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [19]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}',
        'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R3_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R3_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id)[0]
                
                response = requests.get(url, params)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R3: {cont_el / len(R3_sample_keys)}")
    print(f"Measure Reciprocal Rank of R3: {m_mrr / len(R3_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R3_sample_keys))
        asyncio.run(main(R3_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R3_sample_keys, url, pbar))


 99%|█████████▉| 3971/4000 [07:30<00:03,  8.82it/s]

Coverage of R3: 0.98625
Measure Reciprocal Rank of R3: 0.9505167499999503





### query
Coverage of R3: 0.98625

Measure Reciprocal Rank of R3: 0.95051674999995032

# 2T_Round4

In [None]:
GT_file = './data/dataset_GT/2T-2020_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            ids[row["key"]] = {
                "id": 'https://www.wikidata.org/entity/' + row['id'],
                "name": row['name'],
                "ed_score": row['ed_score'],
                "jaccard_score": row['jaccard_score']
            }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/2T_2020/tables/"
cea_file = './data/Dataset/Dataset/2T_2020/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/2T_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [2]:
####################
# READ THE JSON
#####################

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_2T_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R4_2T_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R4_2T_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [3]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_2T_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_2T_sorted_mentions[:q1_idx]
q2 = R4_2T_sorted_mentions[q1_idx:q2_idx]
q3 = R4_2T_sorted_mentions[q2_idx:q3_idx]
q4 = R4_2T_sorted_mentions[q3_idx:]

In [4]:
sample_size = 1000
R4_2T_sample_keys = []
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q1, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q2, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q3, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R4_2T_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R4_2T_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [5]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [None]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 100,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}',
        'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R4_2T_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R4_2T_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id)[0]
                
                response = requests.get(url, params)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R4 2T: {cont_el / len(R4_2T_sample_keys)}")
    print(f"Measure Reciprocal Rank of R4 2T: {m_mrr / len(R4_2T_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R4_2T_sample_keys))
        asyncio.run(main(R4_2T_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R4_2T_sample_keys, url, pbar))


### query 
Coverage of R4 2T: 0.5255

Measure Reciprocal Rank of R4 2T: 0.5184012500000062

# Round4

In [None]:
GT_file = './data/dataset_GT/Round4_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            
            if row['name'] == "imo 9528017":
                print(row)
                break

            
            ids[row["key"]] = {
                "id": 'https://www.wikidata.org/entity/' + row['id'],
                "name": row['name'],
                "ed_score": row['ed_score'],
                "jaccard_score": row['jaccard_score']
            }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round4_2020/tables/"
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value


In [None]:
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round4_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [26]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R4_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R4_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [27]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_sorted_mentions[:q1_idx]
q2 = R4_sorted_mentions[q1_idx:q2_idx]
q3 = R4_sorted_mentions[q2_idx:q3_idx]
q4 = R4_sorted_mentions[q3_idx:]

In [28]:
sample_size = 1000 
R4_sample_keys = []
R4_sample_keys = R4_sample_keys + random.sample(q1, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q2, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q3, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q4, sample_size)

In [None]:

# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R4_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R4_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [None]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [29]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}',
        'sort': [
                f'''{{"popularity": {{"order": "desc"}}}}'''
            ]
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R4_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R4_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (name, url_id) in zip(results, string_name_list.items()):
            if mrr_increment == 0 and count == 0:
                params = {
                    'name': name,
                    'token': 'lamapi_demo_2023',
                    'kg': 'wikidata',
                    'limit': 1000,
                    'query':  f'''{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0, "fuzziness": "AUTO"}}}}}}]}}}}}}''',
                    'sort': [
                        f'''{{"popularity": {{"order": "desc"}}}}'''
                    ]
                }
                id = re.search(r'Q(\d+)$', url_id)[0]
                
                response = requests.get(url, params)
                if response.status_code == 200:
                    data = response.json()
                    #print("after call")
                    num_result = len(data) if data else 0
                    if data:
                        for item in data:
                            if id == item.get('id'):
                                pbar.update(1)  # No need to await here
                                pos_score = item.get('pos_score', 0)
                                if pos_score:
                                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                                else:
                                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                            
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R4: {cont_el / len(R4_sample_keys)}")
    print(f"Measure Reciprocal Rank of R4: {m_mrr / len(R4_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R4_sample_keys))
        asyncio.run(main(R4_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R4_sample_keys, url, pbar))


100%|█████████▉| 3984/4000 [07:36<00:01,  8.72it/s] 


Coverage of R4: 0.99375
Measure Reciprocal Rank of R4: 0.9421867499999452


### query 
Coverage of R4: 0.99375

Measure Reciprocal Rank of R4: 0.94218674999994523

## Datasets Comparison

In [None]:
def extract_scores(data):
    return [item[1]['ed_score'] for item in data]

ed_scores_R1 = extract_scores(R1_sample_keys)
ed_scores_R3 = extract_scores(R3_sample_keys)
ed_scores_R4 = extract_scores(R4_sample_keys)
ed_scores_R4_2T = extract_scores(R4_2T_sample_keys)

# Plot the KDE plots
plt.figure(figsize=(10, 6))

sns.kdeplot(ed_scores_R1, color='skyblue', label='R1 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R3, color='green', label='R3 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R4, color='red', label='R4 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R4_2T, color='purple', label='R4_2T Edit Distance Score', fill=True)

plt.xlabel('Edit Distance Score')
plt.ylabel('Density')
plt.title('Density Plot of Edit Distance Scores for Different Rounds')
plt.legend(loc='upper left')
plt.show()