In [1]:
! pip install backoff



In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import random
import json
import requests
from tqdm import tqdm
from pprint import pprint
import re
import nest_asyncio
import asyncio
import aiohttp
import backoff
from tqdm.asyncio import tqdm
from aiohttp.client_exceptions import ClientResponseError  # Add this import

# Round1_T2D_f3

In [None]:
GT_file = './data/dataset_GT/Round1_T2D_f3.csv'

chunk_size = 1000  # Adjust this based on your memory constraints
ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round1_T2D/tables/"
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [2]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R1_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R1_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R1_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [3]:
# SPLIT OVER THE QUARTILES

n = len(R1_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R1_sorted_mentions[:q1_idx]
q2 = R1_sorted_mentions[q1_idx:q2_idx]
q3 = R1_sorted_mentions[q2_idx:q3_idx]
q4 = R1_sorted_mentions[q3_idx:]
    

In [4]:
sample_size = int(len(R1_sorted_mentions)/40)  
R1_sample_keys = []
R1_sample_keys = R1_sample_keys + random.sample(q1, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q2, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q3, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R1_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R1_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [5]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [None]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(query, session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': query
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            return 0, 0, False
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1, True

    
    ####################################
    # FUZZY SEARCH
    ####################################
    #print("FUZZZZYYYYY")
    


    return 0, 0, False

async def main(R1_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R1_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            query = f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}, {{"range": {{"ntoken": {{"gte": 0, "lte": 3}}}}}}]}}}}}}'
            pbar.update(1)  # No need to await here
            mrr_increment, count, flag = await process_item(query, session, el, string_name_list, url, headers, semaphore, pbar)
            if flag:
                tasks.append((mrr_increment, count))
            else:
                query = f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{str(el)}", "fuzziness": "AUTO", "boost": 2.0}}}}}}, {{"range": {{"ntoken": {{"gte": 0, "lte": 3}}}}}}]}}}}}}'
                mrr_increment, count, _ = await process_item(query, session, el, string_name_list, url, headers, semaphore, pbar)
                tasks.append((mrr_increment, count))

        print(tasks)
        #results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in tasks:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(R1_sample_keys)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R1_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R1_sample_keys))
        await asyncio.run(main(R1_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R1_sample_keys, url, pbar))


  1%|          | 5/608 [00:08<21:00,  2.09s/it]

In [11]:
print(f"Coverage of R1: {cont_el / len(R1_sample_keys)}")
print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R1_sample_keys)}")

NameError: name 'cont_el' is not defined

In [10]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}'
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R1_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R1_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(R1_sample_keys)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R1_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R1_sample_keys))
        asyncio.run(main(R1_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R1_sample_keys, url, pbar))


 94%|█████████▍| 572/608 [00:32<00:02, 17.85it/s]

Coverage of R1: 0.9407894736842105
Measure Reciprocal Rank of R1: 0.919363486842111





### base query
Coverage of R1: 0.962171052631579

Measure Reciprocal Rank of 1T: 0.937271381578953

### popularity based query
Coverage of R1: 0.8552631578947368

Measure Reciprocal Rank of R1: 0.82314144736842513

# Round3_2019

In [None]:
GT_file = './data/dataset_GT/Round3_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            if row['target'] == 1:
                ids[row["key"]] = {
                    "id": 'https://www.wikidata.org/entity/' + row['id'],
                    "name": row['name'],
                    "ed_score": row['ed_score'],
                    "jaccard_score": row['jaccard_score']
                }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round3_2019/tables/"
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [11]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R3_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R3_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R3_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [12]:
# SPLIT OVER THE QUARTILES

n = len(R3_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R3_sorted_mentions[:q1_idx]
q2 = R3_sorted_mentions[q1_idx:q2_idx]
q3 = R3_sorted_mentions[q2_idx:q3_idx]
q4 = R3_sorted_mentions[q3_idx:]
    

In [13]:
sample_size = int(len(R3_sorted_mentions)/40) 
R3_sample_keys = []
R3_sample_keys = R3_sample_keys + random.sample(q1, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q2, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q3, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R3_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R3_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [None]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [14]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}'
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R3_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R3_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(R3_sample_keys)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R3_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R3_sample_keys))
        asyncio.run(main(R3_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R3_sample_keys, url, pbar))


 98%|█████████▊| 14902/15280 [12:50<00:19, 19.35it/s]

Coverage of R1: 0.9752617801047121
Measure Reciprocal Rank of R1: 0.9304014397904427





### base query
Coverage of R3: 0.974738219895288

Measure Reciprocal Rank of R3: 0.929046596858504

### popularity base query

Coverage of R1: 0.9752617801047121

Measure Reciprocal Rank of R1: 0.93040143979044272

# 2T_Round4

In [None]:
GT_file = './data/dataset_GT/2T-2020_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            ids[row["key"]] = {
                "id": 'https://www.wikidata.org/entity/' + row['id'],
                "name": row['name'],
                "ed_score": row['ed_score'],
                "jaccard_score": row['jaccard_score']
            }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/2T_2020/tables/"
cea_file = './data/Dataset/Dataset/2T_2020/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value

In [None]:
cea_file = './data/Dataset/Dataset/2T_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [15]:
####################
# READ THE JSON
#####################

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_2T_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R4_2T_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R4_2T_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [16]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_2T_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_2T_sorted_mentions[:q1_idx]
q2 = R4_2T_sorted_mentions[q1_idx:q2_idx]
q3 = R4_2T_sorted_mentions[q2_idx:q3_idx]
q4 = R4_2T_sorted_mentions[q3_idx:]

In [17]:
sample_size = int(len(R4_2T_sorted_mentions)/40) 
R4_2T_sample_keys = []
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q1, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q2, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q3, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q4, sample_size)

In [None]:
# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R4_2T_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R4_2T_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [None]:
url1 = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [18]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}'
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R4_2T_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R4_2T_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(R4_2T_sample_keys)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R4_2T_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R4_2T_sample_keys))
        asyncio.run(main(R4_2T_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R4_2T_sample_keys, url, pbar))


 48%|████▊     | 3101/6528 [02:16<02:30, 22.77it/s]

Coverage of R1: 0.47503063725490197
Measure Reciprocal Rank of R1: 0.4591340379901872





### base query 
Coverage of R4_2T: 0.48713235294117646

Measure Reciprocal Rank of R4_2T: 0.4726035539215572

### popularity based query 
Coverage of R1: 0.47503063725490197

Measure Reciprocal Rank of R1: 0.4591340379901872

# Round4

In [None]:
GT_file = './data/dataset_GT/Round4_f3.csv'
chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 
total_rows = sum(1 for line in open(GT_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk


with tqdm(total=total_iterations) as pbar:
    for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
        items = chunk_GT[chunk_GT['target'] == 1]
        for _, row in items.iterrows():
            
            if row['name'] == "imo 9528017":
                print(row)
                break

            
            ids[row["key"]] = {
                "id": 'https://www.wikidata.org/entity/' + row['id'],
                "name": row['name'],
                "ed_score": row['ed_score'],
                "jaccard_score": row['jaccard_score']
            }
        pbar.update(1)  # Update progress bar for each chunk iteration

print("Processing complete.")

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round4_2020/tables/"
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
os.listdir(tables)
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype('str') + " " + df[2].astype('str')
cea_keys = set(df["key"].values)
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {row+1} {col}"
            if key in cea_keys:
                cell_value = df.iloc[row, col]
                key_to_cell[key] = cell_value


In [None]:
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        parts = row['url'].split('/')
        wikidata_id = parts[-1]
        num_rows, num_columns = df.shape
        key = f"{row['table_name']} {row['row']} {row['col']}"
        if key in ids:
            cell_value = key_to_cell[key]
            data = ids[key]
            mentions[cell_value] = data

print("Processing complete.")

In [None]:
sorted_mentions = sorted(mentions.items(), key=lambda x: x[1]["ed_score"])

json_file_path = "./data/Round4_sorted_mentions.json"

# Save the sorted_mentions dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(sorted_mentions, json_file, indent=4)

print(f"Sorted mentions saved to {json_file_path}")

In [22]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_sorted_mentions = json.load(file)

In [None]:
# Extract the ed_score and jaccard_score values
ed_scores = [item[1]['ed_score'] for item in R4_sorted_mentions]
jaccard_scores = [item[1]['jaccard_score'] for item in R4_sorted_mentions]

# Convert to a pandas DataFrame for easier analysis
df = pd.DataFrame({'ED Score': ed_scores, 'Jaccard Score': jaccard_scores})

# Density Plot for ED Score and Jaccard Score
plt.figure(figsize=(10, 6))

# Plot for ED Score
sns.kdeplot(df['ED Score'], fill=True, label='ED Score')

# Plot for Jaccard Score
sns.kdeplot(df['Jaccard Score'], fill=True, label='Jaccard Score')

plt.xlabel('Score')
plt.ylabel('Density')
plt.title('Density Plot of ED and Jaccard Scores')
plt.legend(loc='upper left')  # Show legend with labels
plt.show()


In [None]:
df.describe()

## Sample extraction

In [23]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_sorted_mentions[:q1_idx]
q2 = R4_sorted_mentions[q1_idx:q2_idx]
q3 = R4_sorted_mentions[q2_idx:q3_idx]
q4 = R4_sorted_mentions[q3_idx:]

In [24]:
sample_size = int(len(R4_sorted_mentions)/40) 
R4_sample_keys = []
R4_sample_keys = R4_sample_keys + random.sample(q1, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q2, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q3, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q4, sample_size)

In [None]:

# Extracting ED scores and Jaccard scores
ed_scores = [score[1]['ed_score'] for score in R4_sample_keys]
jaccard_scores = [score[1]['jaccard_score'] for score in R4_sample_keys]

plt.figure(figsize=(8, 5))

sns.kdeplot(ed_scores, color='skyblue', label='Edit Distance Score', fill=True)
sns.kdeplot(jaccard_scores, color='salmon', label='Jaccard Score', fill=True)

plt.title('Edit Distance and Jaccard Score Density')
plt.xlabel('Score')
plt.ylabel('Density')
plt.legend(loc="upper left")

plt.show()

## Coverage Measure

In [None]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

In [25]:
# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, el, string_name_list, url, headers, semaphore, pbar):
    params = {
        'name': str(el),
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{el}", "boost": 2.0}}}}}}]}}}}}}'
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{el}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', string_name_list[el])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(R4_sample_keys, url, pbar):
    string_name_list = {item[1]['name']: item[1]['id'] for item in R4_sample_keys}
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for el in string_name_list.keys():
            tasks.append(process_item(session, el, string_name_list, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(R4_sample_keys)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(R4_sample_keys)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(R4_sample_keys))
        asyncio.run(main(R4_sample_keys, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(R4_sample_keys, url, pbar))


 97%|█████████▋| 45074/46264 [43:26<01:08, 17.29it/s]  

Coverage of R1: 0.9742780563721252
Measure Reciprocal Rank of R1: 0.9173043835383703





### popularity based query 
Coverage of R1: 0.9742780563721252

Measure Reciprocal Rank of R1: 0.9173043835383703

## Datasets Comparison

In [None]:
def extract_scores(data):
    return [item[1]['ed_score'] for item in data]

ed_scores_R1 = extract_scores(R1_sample_keys)
ed_scores_R3 = extract_scores(R3_sample_keys)
ed_scores_R4 = extract_scores(R4_sample_keys)
ed_scores_R4_2T = extract_scores(R4_2T_sample_keys)

# Plot the KDE plots
plt.figure(figsize=(10, 6))

sns.kdeplot(ed_scores_R1, color='skyblue', label='R1 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R3, color='green', label='R3 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R4, color='red', label='R4 Edit Distance Score', fill=True)
sns.kdeplot(ed_scores_R4_2T, color='purple', label='R4_2T Edit Distance Score', fill=True)

plt.xlabel('Edit Distance Score')
plt.ylabel('Density')
plt.title('Density Plot of Edit Distance Scores for Different Rounds')
plt.legend(loc='upper left')
plt.show()