In [1]:
!pip install backoff
! pip install SPARQLWrapper

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pac

In [2]:
import json
import random
import os
import pandas as pd
from tqdm.asyncio import tqdm
import re
import aiohttp
import asyncio
import backoff
import nest_asyncio
from SPARQLWrapper import SPARQLWrapper, JSON

# Round1

In [4]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R1_sorted_mentions = json.load(file)

In [5]:
# SPLIT OVER THE QUARTILES

n = len(R1_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R1_sorted_mentions[:q1_idx]
q2 = R1_sorted_mentions[q1_idx:q2_idx]
q3 = R1_sorted_mentions[q2_idx:q3_idx]
q4 = R1_sorted_mentions[q3_idx:]

sample_size = int(len(R1_sorted_mentions)/40)  
R1_sample_keys = []
R1_sample_keys = R1_sample_keys + random.sample(q1, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q2, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q3, sample_size)
R1_sample_keys = R1_sample_keys + random.sample(q4, sample_size)

q_ids = {item[1]['name']: item[1]['id'] for item in R1_sample_keys}

In [17]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round1_T2D/tables/"
cta_file = './data/Dataset/Dataset/Round1_T2D/gt/CTA_Round1_gt.csv'
os.listdir(tables)

mapping = {
    "LOC": [
        "Place", "PopulatedPlace", "City", "Country", "Region", "Mountain", "Island", "Lake", "River",
        "Park", "Building", "HistoricPlace", "Monument", "Bridge", "Road", "Airport"
    ],
    "PER": [
        "Person", "Artist", "Athlete", "Politician", "Scientist", "Writer", "Actor", "Musician", "MilitaryPerson",
        "Religious", "Royalty", "Criminal"
    ],
    "ORG": [
        "Organisation", "Company", "EducationalInstitution", "PoliticalParty", "SportsTeam", "Non-ProfitOrganisation",
        "GovernmentAgency", "ReligiousOrganisation", "Band", "Library", "Museum", "Hospital", "University", "TradeUnion"
    ]
}

# Create reverse mapping
reverse_mapping = {v: k for k, values in mapping.items() for v in values}

# Define function to map df[2] values to their categories
def map_class_to_category(class_name):
    return reverse_mapping.get(class_name, "OTHERS")

# Apply the function and create the 'key' column
cta_keys = {}
df = pd.read_csv(cta_file, header=None)
type = df[2].astype(str).str.split('/').str[-1]
df["category"] = type.apply(map_class_to_category)
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                key_to_cell[key] = tmp_value

100%|██████████| 64/64 [00:01<00:00, 57.21it/s]


In [20]:
def get_keys_from_value(d, value):
    keys = [key for key, val in d.items() if val == value]
    return keys[0]

In [23]:
cea_file = './data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

count = 0
for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        key = f"{row['table_name']} {row['col']}"
        if key in key_to_cell.keys() and row["url"] in q_ids.values():
            count += 1
            data = key_to_cell[key]
            mentions[get_keys_from_value(q_ids, row["url"])] = (row["url"], data)

print("Processing complete.")

100%|██████████| 9/9 [00:00<00:00, 11.75it/s]

Processing complete.
607 elements and founds: 853





In [23]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, name, value, url, headers, semaphore, pbar):
    
    if value[1] != "OTHERS":
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query' : f'''
            {{
                "query": {{
                    "bool": {{
                        "must": [
                            {{
                                "match": {{
                                    "name": {{
                                        "query": "{name}",
                                        "boost": 2.0
                                    }}
                                }}
                            }}
                        ],
                        "should": [
                            {{
                                "term": {{
                                    "NERtype": "{value[1]}"
                                }}
                            }}
                        ]
                    }}
                }}
            }}
        '''
        }
    else:
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0}}}}}}]}}}}}}'
        }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{name}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', value[0])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1

        #print(f"{name} NOT FOUND-->t{item}")

    return 0, 0

async def main(mentions, url, pbar):
    string_name_list = mentions
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for name, type in string_name_list.items():
            tasks.append(process_item(session, name, type, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(mentions)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(mentions)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(mentions))
        asyncio.run(main(mentions, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(mentions, url, pbar))


 98%|█████████▊| 14864/15222 [13:52<00:20, 17.85it/s] 


Coverage of R1: 0.9764814084877151
Measure Reciprocal Rank of R1: 0.9306191696227784


## Coverage with the filtering
Coverage of R1: 0.9764814084877151

Measure Reciprocal Rank of R1: 0.930619169622778443

# Round3

In [24]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R3_sorted_mentions = json.load(file)

In [25]:
# SPLIT OVER THE QUARTILES

n = len(R3_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R3_sorted_mentions[:q1_idx]
q2 = R3_sorted_mentions[q1_idx:q2_idx]
q3 = R3_sorted_mentions[q2_idx:q3_idx]
q4 = R3_sorted_mentions[q3_idx:]


sample_size = int(len(R3_sorted_mentions)/40) 
R3_sample_keys = []
R3_sample_keys = R3_sample_keys + random.sample(q1, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q2, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q3, sample_size)
R3_sample_keys = R3_sample_keys + random.sample(q4, sample_size)

q_ids = {item[1]['name']: item[1]['id'] for item in R3_sample_keys}

In [26]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round3_2019/tables/"
cta_file = './data/Dataset/Dataset/Round3_2019/gt/CTA_Round3_gt.csv'
os.listdir(tables)


# Apply the function and create the 'key' column
cta_keys = {}
df = pd.read_csv(cta_file, header=None)
category_list = []

for row_idx in range(df.shape[0]):
    col_idx = 2
    while True:
        try:
            if pd.isna(df.iloc[row_idx,col_idx]):
                category_list.append("OTHERS")
                break
            urls = df.iloc[row_idx,col_idx].split(' ')
        except IndexError as e:
            category_list.append("OTHERS")
            break
        
        #print(f"{df.iloc[row_idx,0]}->{cell_urls} @ {row_idx},{col_idx}")
        find = False
        for url in urls:
            type = url.split('/')[-1]            
            if type == "Person":
                category_list.append("PER")
                find = True
                break
            elif type == "Location":
                category_list.append("LOC")
                find = True
                break
            elif type == "Organisation":
                category_list.append("ORG")
                find = True
                break
        if find:
            break
        
        col_idx += 1



df["category"] = category_list
cta_keys = {}
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                key_to_cell[key] = tmp_value

100%|██████████| 2162/2162 [05:32<00:00,  6.50it/s]


In [27]:
def get_keys_from_value(d, value):
    keys = [key for key, val in d.items() if val == value]
    return keys[0]

In [28]:
cea_file = './data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        key = f"{row['table_name']} {row['col']}"
        if key in key_to_cell.keys() and row["url"] in q_ids.values():
            data = key_to_cell[key]
            mentions[get_keys_from_value(q_ids, row["url"])] = (row["url"], data)

print("Processing complete.")

100%|██████████| 391/391 [07:15<00:00,  1.11s/it]

Processing complete.





In [29]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, name, value, url, headers, semaphore, pbar):
    if value[1] != "OTHERS":
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query' : f'''
            {{
                "query": {{
                    "bool": {{
                        "must": [
                            {{
                                "match": {{
                                    "name": {{
                                        "query": "{name}",
                                        "boost": 2.0
                                    }}
                                }}
                            }}
                        ],
                        "should": [
                            {{
                                "term": {{
                                    "NERtype": "{value[1]}"
                                }}
                            }}
                        ]
                    }}
                }}
            }}
        '''
        }
    else:
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{name}", "boost": 2.0}}}}}}]}}}}}}'
        }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{name}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', value[0])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1


    return 0, 0

async def main(mentions, url, pbar):
    string_name_list = mentions
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for name, type in string_name_list.items():
            tasks.append(process_item(session, name, type, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R1: {cont_el / len(mentions)}")
    print(f"Measure Reciprocal Rank of R1: {m_mrr / len(mentions)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(mentions))
        asyncio.run(main(mentions, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(mentions, url, pbar))


 97%|█████████▋| 14836/15233 [23:25<00:37, 10.55it/s] 


Coverage of R1: 0.9739381605724414
Measure Reciprocal Rank of R1: 0.9310733932907487


## Coverage with the filteringCoverage of R1: 0.9739381605724414

Measure Reciprocal Rank of R1: 0.9310733932907487
6

# 2T_Round4

In [30]:
####################
# READ THE JSON
#####################

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_2T_sorted_mentions = json.load(file)

In [31]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_2T_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_2T_sorted_mentions[:q1_idx]
q2 = R4_2T_sorted_mentions[q1_idx:q2_idx]
q3 = R4_2T_sorted_mentions[q2_idx:q3_idx]
q4 = R4_2T_sorted_mentions[q3_idx:]

sample_size = int(len(R4_2T_sorted_mentions)/40) 
R4_2T_sample_keys = []
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q1, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q2, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q3, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q4, sample_size)

q_ids = {item[1]['name']: item[1]['id'] for item in R4_2T_sample_keys}

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/2T_Round4/tables/"
cta_file = './data/Dataset/Dataset/2T_Round4/gt/cta.csv'
os.listdir(tables)

mapping = {
    "LOC": [
        "Q17334923",  # Place
        "Q486972",    # PopulatedPlace
        "Q618123",    # Geographical Feature
        "Q515",       # City
        "Q6256",      # Country
        "Q82794",     # Region
        "Q8502",      # Mountain
        "Q23442",     # Island
        "Q23397",     # Lake
        "Q4022",      # River
        "Q22698",     # Park
        "Q41176",     # Building
        "Q839954",    # HistoricPlace
        "Q4989906",   # Monument
        "Q12280",     # Bridge
        "Q34442",     # Road
        "Q1248784",   # Airport
        "Q13226383",  # Residential area
        "Q13221722",  # Industrial area
        "Q164238",    # Forest
        "Q55488",     # Desert
        "Q3065569",   # Archaeological site
        "Q14752696",  # Campus
        "Q1022083",   # Tourist attraction
        "Q207694",    # World Heritage Site
        "Q4989906",   # Monument
        "Q2065736",   # Memorial
        "Q10864048",  # Castle
        "Q23413",     # Palace
        "Q838948",    # Estate
    ],
    "PER": [
        "Q215627",    # Person
        "Q5",         # Human
        "Q483501",    # Artist
        "Q2066131",   # Athlete
        "Q82955",     # Politician
        "Q901",       # Scientist
        "Q36180",     # Writer
        "Q33999",     # Actor
        "Q639669",    # Musician
        "Q6581097",   # MilitaryPerson
        "Q947873",    # Religious
        "Q12078",     # Royalty
        "Q1456951",   # Criminal
        "Q36834",     # Philosopher
        "Q49757",     # Director
        "Q622425",    # Producer
        "Q333634",    # Businessperson
        "Q937857",    # Entrepreneur
        "Q3282637",   # Inventor
        "Q10871364",  # Fashion designer
        "Q4834541",   # Journalist
        "Q36180",     # Poet
        "Q170790",    # Diplomat
        "Q376799",    # Chef
        "Q24238356",  # Influencer
        "Q207694",    # Celebrity
        "Q10864048",  # Explorer
    ],
    "ORG": [
        "Q43229",     # Organisation
        "Q783794",    # Company
        "Q3918",      # EducationalInstitution
        "Q7278",      # PoliticalParty
        "Q12973014",  # SportsTeam
        "Q163740",    # Non-ProfitOrganisation
        "Q327333",    # GovernmentAgency
        "Q1969448",   # ReligiousOrganisation
        "Q215380",    # Band
        "Q7075",      # Library
        "Q33506",     # Museum
        "Q16917",     # Hospital
        "Q216052",    # TradeUnion
        "Q6501447",   # Research Institute
        "Q163740",    # Foundation
        "Q200695",    # Bank
        "Q19317",     # Corporation
        "Q875538",    # Law firm
        "Q2234766",   # Professional association
        "Q2088357",   # Charity
        "Q163740",    # NGO (Non-Governmental Organization)
        "Q484652",    # Think tank
        "Q507619",    # Multinational corporation
        "Q4438121",   # Media company
        "Q156537",    # Publishing company
        "Q11426",     # Airline
        "Q980447",    # Hotel chain
        "Q1131088",   # Technology company
    ]
}


# Create reverse mapping
reverse_mapping = {v: k for k, values in mapping.items() for v in values}

# Define function to map df[2] values to their categories
def map_class_to_category(class_name):
    return reverse_mapping.get(class_name, "OTHERS")

# Apply the function and create the 'key' column
cta_keys = {}
df = pd.read_csv(cta_file, header=None)

category_list = []

for row_idx in range(df.shape[0]):
    col_idx = 2
    while True:
        try:
            if pd.isna(df.iloc[row_idx,col_idx]):
                category_list.append("OTHERS")
                break
            urls = df.iloc[row_idx,col_idx].split(' ')
        except IndexError as e:
            category_list.append("OTHERS")
            break
        
        #print(f"{df.iloc[row_idx,0]}->{cell_urls} @ {row_idx},{col_idx}")
        urls = df.iloc[row_idx,col_idx].split(' ')
        find = False
        for url in urls:
            type = url.split('/')[-1]     
            map_class_to_category(type)
            if type == "Person":
                category_list.append("PER")
                find = True
                break
            elif type == "Location":
                category_list.append("LOC")
                find = True
                break
            elif type == "Organisation":
                category_list.append("ORG")
                find = True
                break
        if find:
            break
        
        col_idx += 1

type = df[2].astype(str).str.split('/').str[-1]
df["category"] = type.apply(map_class_to_category)
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                key_to_cell[key] = tmp_value

In [52]:
# find the mention in the table
tables = "./data/Dataset/Dataset/2T_Round4/tables/"
cta_file = './data/Dataset/Dataset/2T_Round4/gt/cta.csv'
os.listdir(tables)

mapping = {
    "LOC": [
        "Q17334923",  # Place
        "Q486972",    # PopulatedPlace
        "Q618123",    # Geographical Feature
        "Q515",       # City
        "Q6256",      # Country
        "Q82794",     # Region
        "Q8502",      # Mountain
        "Q23442",     # Island
        "Q23397",     # Lake
        "Q4022",      # River
        "Q22698",     # Park
        "Q41176",     # Building
        "Q839954",    # HistoricPlace
        "Q4989906",   # Monument
        "Q12280",     # Bridge
        "Q34442",     # Road
        "Q1248784",   # Airport
        "Q13226383",  # Residential area
        "Q13221722",  # Industrial area
        "Q164238",    # Forest
        "Q55488",     # Desert
        "Q3065569",   # Archaeological site
        "Q14752696",  # Campus
        "Q1022083",   # Tourist attraction
        "Q207694",    # World Heritage Site
        "Q4989906",   # Monument
        "Q2065736",   # Memorial
        "Q10864048",  # Castle
        "Q23413",     # Palace
        "Q838948",    # Estate
    ],
    "PER": [
        "Q215627",    # Person
        "Q5",         # Human
        "Q483501",    # Artist
        "Q2066131",   # Athlete
        "Q82955",     # Politician
        "Q901",       # Scientist
        "Q36180",     # Writer
        "Q33999",     # Actor
        "Q639669",    # Musician
        "Q6581097",   # MilitaryPerson
        "Q947873",    # Religious
        "Q12078",     # Royalty
        "Q1456951",   # Criminal
        "Q36834",     # Philosopher
        "Q49757",     # Director
        "Q622425",    # Producer
        "Q333634",    # Businessperson
        "Q937857",    # Entrepreneur
        "Q3282637",   # Inventor
        "Q10871364",  # Fashion designer
        "Q4834541",   # Journalist
        "Q36180",     # Poet
        "Q170790",    # Diplomat
        "Q376799",    # Chef
        "Q24238356",  # Influencer
        "Q207694",    # Celebrity
        "Q10864048",  # Explorer
    ],
    "ORG": [
        "Q43229",     # Organisation
        "Q783794",    # Company
        "Q3918",      # EducationalInstitution
        "Q7278",      # PoliticalParty
        "Q12973014",  # SportsTeam
        "Q163740",    # Non-ProfitOrganisation
        "Q327333",    # GovernmentAgency
        "Q1969448",   # ReligiousOrganisation
        "Q215380",    # Band
        "Q7075",      # Library
        "Q33506",     # Museum
        "Q16917",     # Hospital
        "Q216052",    # TradeUnion
        "Q6501447",   # Research Institute
        "Q163740",    # Foundation
        "Q200695",    # Bank
        "Q19317",     # Corporation
        "Q875538",    # Law firm
        "Q2234766",   # Professional association
        "Q2088357",   # Charity
        "Q163740",    # NGO (Non-Governmental Organization)
        "Q484652",    # Think tank
        "Q507619",    # Multinational corporation
        "Q4438121",   # Media company
        "Q156537",    # Publishing company
        "Q11426",     # Airline
        "Q980447",    # Hotel chain
        "Q1131088",   # Technology company
    ]
}


def get_item_root(mapping, id_list):
    # Flatten the list of target IDs
    target_ids = [item for sublist in mapping.values() for item in sublist]
    
    # Define the SPARQL endpoint
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    
    # Define the SPARQL query
    id_list_str = " ".join("wd:" + id_ for id_ in id_list)
    target_ids_str = " ".join("wd:" + tid for tid in target_ids)
    query = """
        SELECT ?item ?class WHERE {
          VALUES ?item { %s }
          ?item wdt:P279* ?class.
          VALUES ?class { %s }
        }
    """ % (id_list_str, target_ids_str)
    
    # Set the query and request JSON response
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    try:
        # Execute the query and convert the result to JSON
        results = sparql.query().convert()
    except Exception as e:
        print(f"Error executing SPARQL query: {e}")
        return {}
    
    
    # Extract the classes from the results
    id_to_root_class = {}
    for result in results["results"]["bindings"]:
        item_id = result["item"]["value"].split('/')[-1]
        root_class = result["class"]["value"].split('/')[-1]
        id_to_root_class[item_id] = root_class
    
    return id_to_root_class

def get_item_root_batch(mapping, ids, batch_size=50):
    root_classes = []
    with tqdm(total=len(ids), desc="SPARQL queries") as pbar:
        for i in range(0, len(ids), batch_size):
            batch_ids = ids[i:i+batch_size]
            batch_root_classes = get_item_root(mapping, batch_ids)
            root_classes.extend([(root_class, id_) for id_, root_class in batch_root_classes.items()])
            pbar.update(len(batch_ids))
    return root_classes

# Create reverse mapping
reverse_mapping = {v: k for k, values in mapping.items() for v in values}

# Define function to map df[2] values to their categories
def map_class_to_category(class_name):
    return reverse_mapping.get(class_name, "OTHERS")

# Apply the function and create the 'key' column
root_classes = []
df = pd.read_csv(cta_file, header=None)
types = df[2].astype(str).str.split(' ')


ids = []
for el in types:
    ids.append(el[0].split('/')[-1])
root_classes = get_item_root_batch(mapping, list(ids))

# Map root classes to categories
root_categories = [(map_class_to_category(root_id), id_) for root_id, id_ in root_classes]


df["category"] = root_categories
cta_keys = {}
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                print(tmp_value)
                break
                key_to_cell[key] = (tmp_value, cta_keys["key"][2].iloc[tmp_index])

SPARQL queries: 100%|██████████| 540/540 [00:07<00:00, 76.41it/s]


ValueError: Length of values (46) does not match length of index (540)

In [42]:
key_to_cell

{'00E2H310 0': 'OTHERS',
 '00E2H310 1': 'OTHERS',
 '00E2H310 2': 'OTHERS',
 '0AJSJYAL 0': 'OTHERS',
 '0D70DN48 0': 'OTHERS',
 '0D70DN48 2': 'OTHERS',
 '0D70DN48 3': 'OTHERS',
 '0H1C2CNE 0': 'OTHERS',
 '0IR0XIUW 0': 'OTHERS',
 '0IR0XIUW 1': 'ORG',
 '0IR0XIUW 2': 'OTHERS',
 '1C7N45JA 0': 'ORG',
 '1C7N45JA 1': 'OTHERS',
 '1C7N45JA 2': 'OTHERS',
 '1C7N45JA 3': 'OTHERS',
 '1C7N45JA 4': 'OTHERS',
 '1C7N45JA 5': 'OTHERS',
 '1C9LFOKN 0': 'OTHERS',
 '1C9LFOKN 1': 'OTHERS',
 '1MQL5T7F 0': 'OTHERS',
 '1MQL5T7F 1': 'OTHERS',
 '1MQL5T7F 5': 'OTHERS',
 '1MQL5T7F 7': 'OTHERS',
 '24W5SSRB 0': 'OTHERS',
 '24W5SSRB 2': 'OTHERS',
 '24W5SSRB 3': 'OTHERS',
 '24W5SSRB 4': 'OTHERS',
 '29BNEL1Q 0': 'OTHERS',
 '29BNEL1Q 1': 'OTHERS',
 '29BNEL1Q 2': 'ORG',
 '29BNEL1Q 3': 'LOC',
 '2BEBH437 0': 'OTHERS',
 '2BEBH437 1': 'OTHERS',
 '2BEBH437 2': 'OTHERS',
 '2EZKB5RU 0': 'OTHERS',
 '2EZKB5RU 1': 'OTHERS',
 '2EZKB5RU 2': 'OTHERS',
 '2VRS31OV 0': 'OTHERS',
 '2VRS31OV 1': 'OTHERS',
 '2VRS31OV 2': 'OTHERS',
 '2XHMV76G 1

In [122]:
def get_keys_from_value(d, value):
    keys = [key for key, val in d.items() if val == value]
    return keys[0]

In [123]:
cea_file = './data/Dataset/Dataset/2T_Round4/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        key = f"{row['table_name']} {row['col']}"
        if key in key_to_cell.keys() and row["url"].split(" ")[0] in q_ids.values():
            data = key_to_cell[key]
            mentions[get_keys_from_value(q_ids, row["url"])] = (row["url"], data)

print("Processing complete.")

100%|██████████| 668/668 [01:53<00:00,  5.90it/s]

Processing complete.





# Round4

In [53]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_sorted_mentions = json.load(file)

In [54]:
## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_sorted_mentions[:q1_idx]
q2 = R4_sorted_mentions[q1_idx:q2_idx]
q3 = R4_sorted_mentions[q2_idx:q3_idx]
q4 = R4_sorted_mentions[q3_idx:]

sample_size = int(len(R4_sorted_mentions)/40) 
R4_sample_keys = []
R4_sample_keys = R4_sample_keys + random.sample(q1, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q2, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q3, sample_size)
R4_sample_keys = R4_sample_keys + random.sample(q4, sample_size)

q_ids = {item[1]['name']: item[1]['id'] for item in R4_sample_keys}

In [None]:
# find the mention in the table
tables = "./data/Dataset/Dataset/Round4_2020/tables/"
cta_file = './data/Dataset/Dataset/Round4_2020/gt/cta.csv'
os.listdir(tables)

mapping = {
    "LOC": [
        "Q17334923",  # Place
        "Q486972",    # PopulatedPlace
        "Q618123",    # Geographical Feature
        "Q515",       # City
        "Q6256",      # Country
        "Q82794",     # Region
        "Q8502",      # Mountain
        "Q23442",     # Island
        "Q23397",     # Lake
        "Q4022",      # River
        "Q22698",     # Park
        "Q41176",     # Building
        "Q839954",    # HistoricPlace
        "Q4989906",   # Monument
        "Q12280",     # Bridge
        "Q34442",     # Road
        "Q1248784",   # Airport
        "Q13226383",  # Residential area
        "Q13221722",  # Industrial area
        "Q164238",    # Forest
        "Q55488",     # Desert
        "Q3065569",   # Archaeological site
        "Q14752696",  # Campus
        "Q1022083",   # Tourist attraction
        "Q207694",    # World Heritage Site
        "Q4989906",   # Monument
        "Q2065736",   # Memorial
        "Q10864048",  # Castle
        "Q23413",     # Palace
        "Q838948",    # Estate
    ],
    "PER": [
        "Q215627",    # Person
        "Q5",         # Human
        "Q483501",    # Artist
        "Q2066131",   # Athlete
        "Q82955",     # Politician
        "Q901",       # Scientist
        "Q36180",     # Writer
        "Q33999",     # Actor
        "Q639669",    # Musician
        "Q6581097",   # MilitaryPerson
        "Q947873",    # Religious
        "Q12078",     # Royalty
        "Q1456951",   # Criminal
        "Q36834",     # Philosopher
        "Q49757",     # Director
        "Q622425",    # Producer
        "Q333634",    # Businessperson
        "Q937857",    # Entrepreneur
        "Q3282637",   # Inventor
        "Q10871364",  # Fashion designer
        "Q4834541",   # Journalist
        "Q36180",     # Poet
        "Q170790",    # Diplomat
        "Q376799",    # Chef
        "Q24238356",  # Influencer
        "Q207694",    # Celebrity
        "Q10864048",  # Explorer
    ],
    "ORG": [
        "Q43229",     # Organisation
        "Q783794",    # Company
        "Q3918",      # EducationalInstitution
        "Q7278",      # PoliticalParty
        "Q12973014",  # SportsTeam
        "Q163740",    # Non-ProfitOrganisation
        "Q327333",    # GovernmentAgency
        "Q1969448",   # ReligiousOrganisation
        "Q215380",    # Band
        "Q7075",      # Library
        "Q33506",     # Museum
        "Q16917",     # Hospital
        "Q216052",    # TradeUnion
        "Q6501447",   # Research Institute
        "Q163740",    # Foundation
        "Q200695",    # Bank
        "Q19317",     # Corporation
        "Q875538",    # Law firm
        "Q2234766",   # Professional association
        "Q2088357",   # Charity
        "Q163740",    # NGO (Non-Governmental Organization)
        "Q484652",    # Think tank
        "Q507619",    # Multinational corporation
        "Q4438121",   # Media company
        "Q156537",    # Publishing company
        "Q11426",     # Airline
        "Q980447",    # Hotel chain
        "Q1131088",   # Technology company
    ]
}

def get_item_root(mapping, id_list):
    # Flatten the list of target IDs
    target_ids = [item for sublist in mapping.values() for item in sublist]
    
    # Define the SPARQL endpoint
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    
    # Define the SPARQL query
    id_list_str = " ".join("wd:" + id_ for id_ in id_list)
    target_ids_str = " ".join("wd:" + tid for tid in target_ids)
    query = """
    SELECT ?item ?class WHERE {
      VALUES ?item { %s }
      ?item wdt:P279* ?class.
      VALUES ?class { %s }
    }
    """ % (id_list_str, target_ids_str)
    
    # Set the query and request JSON response
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    # Execute the query and convert the result to JSON
    results = sparql.query().convert()
    
    # Extract the classes from the results
    id_to_root_class = {}
    for result in results["results"]["bindings"]:
        item_id = result["item"]["value"].split('/')[-1]
        root_class = result["class"]["value"].split('/')[-1]
        id_to_root_class[item_id] = root_class
    
    return id_to_root_class

def get_item_root_batch(mapping, ids, batch_size=50):
    root_classes = {}
    with tqdm(total=len(ids), desc="SPARQL queries") as pbar:
        for i in range(0, len(ids), batch_size):
            batch_ids = ids[i:i+batch_size]
            batch_root_classes = get_item_root(mapping, batch_ids)
            root_classes.update(batch_root_classes)
            pbar.update(len(batch_ids))
    return [root_classes.get(id_, None) for id_ in ids]

# Create reverse mapping
reverse_mapping = {v: k for k, values in mapping.items() for v in values}

# Define function to map df[2] values to their categories
def map_class_to_category(class_name):
    return reverse_mapping.get(class_name, "OTHERS")

# Apply the function and create the 'key' column
root_classes = []
df = pd.read_csv(cta_file, header=None)
ids = [url.split('/')[-1] for url in df[2]]

root_classes = get_item_root_batch(mapping, ids)

# Map root classes to categories
root_categories = [map_class_to_category(root_id) for root_id in root_classes]


df["category"] = root_categories
cta_keys = {}
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                key_to_cell[key] = tmp_value

SPARQL queries: 100%|██████████| 31922/31922 [10:06<00:00, 52.67it/s]
 63%|██████▎   | 13994/22207 [1:06:49<5:38:02,  2.47s/it]

In [None]:
def get_keys_from_value(d, value):
    keys = [key for key, val in d.items() if val == value]
    return keys[0]

In [None]:
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"] 

total_rows = sum(1 for line in open(cea_file)) - 1  # Exclude header
total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

for chunk_cea in tqdm(pd.read_csv(cea_file, chunksize=chunk_size), total=total_iterations):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        key = f"{row['table_name']} {row['col']}"
        if key in key_to_cell.keys() and row["url"] in q_ids.values():
            print("found")
            data = key_to_cell[key]
            mentions[get_keys_from_value(q_ids, row["url"])] = (row["url"], data)

print("Processing complete.")

In [33]:
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except Exception as e:
                return []

async def process_item(session, name, value, url, headers, semaphore, pbar):
    params = {
        'name': name,
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 1000,
        'query': f'''
        {{
            "query": {{
                "bool": {{
                    "must": [
                        {{
                            "match": {{
                                "name": {{
                                    "query": "{name}",
                                    "boost": 2.0
                                }}
                            }}
                        }},
                        {{
                            "term": {{
                                "NERtype": "{value[1]}"
                            }}
                        }}
                    ]
                }}
            }}
        }}
        '''
    }

    try:
        data = await fetch(session, url, params, headers, semaphore)
    except ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{name}'")
            pbar.update(1)  # No need to await here
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            GT_id_match = re.search(r'Q(\d+)$', value[0])
            if GT_id_match:
                GT_id = GT_id_match[0]
                if GT_id == item.get('id'):
                    pbar.update(1)  # No need to await here
                   # print(f"{el}-->t{item}")
                    #print("__________________________")
                    pos_score = item.get('pos_score', 0)
                    if pos_score:
                        mrr_increment = (num_result - (pos_score * num_result)) / num_result
                    else:
                        mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                    return mrr_increment, 1

    return 0, 0

async def main(mentions, url, pbar):
    string_name_list = mentions
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        for name, type in string_name_list.items():
            tasks.append(process_item(session, name, type, url, headers, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for mrr_increment, count in results:
            m_mrr += mrr_increment
            cont_el += count

        pbar.close()  # No need to await here

    print(f"Coverage of R4: {cont_el / len(mentions)}")
    print(f"Measure Reciprocal Rank of R4: {m_mrr / len(mentions)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(mentions))
        asyncio.run(main(mentions, url, pbar))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(mentions, url, pbar))


0it [00:00, ?it/s]


ZeroDivisionError: division by zero