In [6]:
import json
import random
import os
import pandas as pd
import re
import aiohttp
import asyncio
import backoff
import nest_asyncio
import time
from SPARQLWrapper import SPARQLWrapper, JSON
from requests import get
import numpy as np
import requests
from aiohttp import ClientResponseError
import logging
from tqdm import tqdm


In [None]:
####################
# READ THE JSON
#####################

json_file_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/2T_Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_2T_sorted_mentions = json.load(file)


## Sample extraction
# SPLIT OVER THE QUARTILES

n = len(R4_2T_sorted_mentions)
q1_idx = n // 4
q2_idx = n // 2
q3_idx = 3 * n // 4

# Step 3: Split the list into quartiles
q1 = R4_2T_sorted_mentions[:q1_idx]
q2 = R4_2T_sorted_mentions[q1_idx:q2_idx]
q3 = R4_2T_sorted_mentions[q2_idx:q3_idx]
q4 = R4_2T_sorted_mentions[q3_idx:]

sample_size = 1000
R4_2T_sample_keys = []
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q1, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q2, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q3, sample_size)
R4_2T_sample_keys = R4_2T_sample_keys + random.sample(q4, sample_size)

q_ids = {item[1]['name']: item[1]['id'] for item in R4_2T_sample_keys}

In [None]:
len(R4_2T_sorted_mentions)

In [None]:
def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids


try:
    organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
    #print(len(organization_subclass))
except json.decoder.JSONDecodeError:
    pass

try:
    country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
except json.decoder.JSONDecodeError:
    country_subclass = set()
    pass

try:
    city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
except json.decoder.JSONDecodeError:
    city_subclass = set()
    pass

try:
    capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
except json.decoder.JSONDecodeError:
    capitals_subclass = set()
    pass

try:
    admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
except json.decoder.JSONDecodeError:
    admTerr_subclass = set()
    pass

try:
    family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
except json.decoder.JSONDecodeError:
    family_subclass = set()
    pass

try:
    sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
except json.decoder.JSONDecodeError:
    sportLeague_subclass = set()
    pass

try:
    venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
except json.decoder.JSONDecodeError:
    venue_subclass = set()
    pass
    
try:
    organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))
    #print(len(organization_subclass))
except json.decoder.JSONDecodeError:
    pass


try:
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
    #print(len(geolocation_subclass))
except json.decoder.JSONDecodeError:
    pass

try:
    food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
except json.decoder.JSONDecodeError:
    food_subclass = set()
    pass

try:
    edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
except json.decoder.JSONDecodeError:
    edInst_subclass = set()
    pass

try:
    govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
except json.decoder.JSONDecodeError:
    govAgency_subclass = set()
    pass

try:
    intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
except json.decoder.JSONDecodeError:
    intOrg_subclass = set()
    pass

try:
    timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
except json.decoder.JSONDecodeError:
    timeZone_subclass = set()
    pass
   
try:
    organization_subclass = list(set(organization_subclass) | set(edInst_subclass) | set(govAgency_subclass) | set(intOrg_subclass))
    geolocation_subclass = list(set(geolocation_subclass) | set(country_subclass) | set(city_subclass) | set(capitals_subclass) | set(admTerr_subclass))
    geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))
   #print(len(geolocation_subclass))
except json.decoder.JSONDecodeError:
    pass

try:
    human_subclass = get_wikidata_item_tree_item_idsSPARQL([5], backward_properties=[279])
except json.decoder.JSONDecodeError:
    human_subclass = set()
    pass

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import logging

tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/2T_Round4/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/2T_Round4/gt/cea.csv'
os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype(str) + " " + df[2].astype(str)
cea_values_dict = dict(zip(df["key"].values, df[3].values))
cea_keys_set = set(df["key"].values)

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        local_key_to_cell = {}
        
        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    local_key_to_cell[key] = (cell_value, cea_values_dict[key])
                    break  # Exit inner loop early as only one match per row/col is needed
        
        return local_key_to_cell
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)


In [None]:
tables = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/2T_Round4/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/2T_Round4/gt/cea.csv'
cta_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/2T_Round4/gt/cta.csv'
os.listdir(tables)

def get_item_root(id_list):     
    id_to_root_class = {}
    for el in id_list:
        inst_item = int(re.search(r'(\d+)$', el)[0])
        if inst_item in geolocation_subclass:
            #id_to_root_class[el] = "LOC"
            return "LOC"
        elif inst_item in organization_subclass:
            #id_to_root_class[el] = "ORG"
            return "ORG"
        elif inst_item in human_subclass:
            #id_to_root_class[el] = "PERS"
            return "PERS"      
    
    return "OTHERS"

# Apply the function and create the 'key' column
root_classes = []
df = pd.read_csv(cta_file, header=None)
root_categories = []
for urls in df[2]:
    tmp = [url.split('/')[-1] for url in urls.split(" ")]
    root_categories.append(get_item_root(tmp))




df["category"] = root_categories
cta_keys = {}
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

ner_type = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            key = f"{table_name} {col}"
            if key in set(cta_keys["key"][0].values):
                tmp_index = cta_keys["key"][0].values.tolist().index(key)
                tmp_value = cta_keys["key"][1].iloc[tmp_index]
                ner_type[key] = tmp_value

In [None]:
### in case you want HT2

tables_path = "C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/tables/"
cea_file = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/Dataset/Dataset/HardTablesR2/gt/cea.csv'
os.listdir(tables_path)
# Initialize logging
#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype(str) + " " + df[2].astype(str)
cea_values_dict = dict(zip(df["key"].values, df[3].values))
cea_keys_set = set(df["key"].values)

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        local_key_to_cell = {}
        
        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    local_key_to_cell[key] = (cell_value, cea_values_dict[key])
                    break  # Exit inner loop early as only one match per row/col is needed
        
        return local_key_to_cell
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

pattern = r'^\.'

# Create a list of file paths, excluding files that start with a dot
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path) if not re.match(pattern, table)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)



with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/work/HT2_ner_type.json', 'r') as f:
    ner_type = json.load(f)

In [None]:
### in case you want R4

with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/work/data/R4_ner_type_new.json', 'r') as f:
    ner_type = json.load(f)

with open('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/work/R4_key_to_cell.json', 'r') as f:
    key_to_cell = json.load(f)

In [2]:
import json

# Specify the path to the JSON file
file_path = "./R4_ner_type_new.json"

# Open and read the JSON file
with open(file_path, 'r') as f:
    ner_type = json.load(f)

with open('./R4_key_to_cell.json', 'r') as f:
    key_to_cell = json.load(f)

# Now key_to_cell contains the dictionary loaded from the JSON file
print("Dictionary loaded from JSON file:")


Dictionary loaded from JSON file:


In [3]:

from collections import Counter

category_counts = Counter(ner_type.values())

# Display the counts for each category
for category, count in category_counts.items():
    print(f'{category}: {count}')

OTHERS: 13306
PERS: 1607
LOC: 260
ORG: 211


In [None]:
sample_size = 10000
key_to_cell_sample = dict(random.sample(list(key_to_cell.items()), sample_size))

In [None]:
import aiohttp
import backoff
import asyncio
import re
import pandas as pd
from collections import Counter
from tqdm.asyncio import tqdm

rows = []

url = 'https://lamapi.hel.sintef.cloud/entity/labels?token=lamapi_demo_2023'

headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

# Define the async function to fetch data with retries
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=5, 
    max_time=300
)
async def fetch_data(session, url, data):
    async with session.post(url, headers=headers, json=data) as response:
        # Check the content type of the response
        content_type = response.headers.get('Content-Type', '').lower()
        
        if 'application/json' not in content_type:
            print(f"Unexpected content type: {content_type}. URL: {url}")
            return None
        
        return await response.json()

# Main async function to process mentions
async def process_mentions():
    async with aiohttp.ClientSession() as session:
        for key, (text, id) in tqdm(key_to_cell_sample.items(), desc='Processing mentions', unit='item'):
            match = re.search(r'Q(\d+)$', id)
            
            if not match:
                continue
            
            data = {'json': [match[0]]}

            try:
                response_json = await fetch_data(session, url, data)
                
                if response_json is None or len(response_json) == 0:
                    continue

                
                desc = response_json[match[0]]['description']
                if desc == None:
                    continue
            
                label = response_json[match[0]]['NERtype']
                new_row = {'text': text, 'label': label, 'desc': desc}

                rows.append(new_row)
            except KeyError:
                continue

# Run the async function
await (process_mentions())

# Create DataFrame from the list of rows
df = pd.DataFrame(rows)
category_counts = Counter(df['label'])

# Display the counts for each category
for category, count in category_counts.items():
    print(f'{category}: {count}')
    
#df.to_csv('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/embedding_training_data/Round3_train.csv', index=False)


In [None]:

df.to_csv('C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data/embedding_training_data/HT2_train.csv', index=False)

In [None]:
sample_size = 4000
key_to_cell_sample = dict(random.sample(list(key_to_cell.items()), sample_size))

In [9]:
def get_query(name, value):
    name = str(name).replace('"', ' ')
    if value is not None:
        # Soft filtering constraint
        query_dict = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"name": {"query": name, "boost": 2.0}}}
                    ],
                    "should": [
                        {"term": {"NERtype": value}}
                    ]
                }
            }
        }
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': json.dumps(query_dict),  # Convert the query dictionary to a JSON string
            'sort': [
                '{"popularity": {"order": "desc"}}'
            ]
        }
    
    return params

url = 'https://lamapi.hel.sintef.cloud/entity/labels?token=lamapi_demo_2023'

# Define the headers
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

queries = []
key_to_cell_sample = {}
for key in tqdm(key_to_cell):
    id_table, _, id_col = key.split(" ")
    name = key_to_cell[key][0]
    q_ids = key_to_cell[key][1].split(' ')
    new_key = f"{id_table} {id_col}"
    if new_key in ner_type:
        key_to_cell_sample[key] = key_to_cell[key]
        NER_type = ner_type[new_key]
        query = get_query(name, NER_type)
        
        matched_results = []
        for q_id in q_ids:            
            match = re.search(r'Q(\d+)$', q_id)
            if not match:
                continue
            data = {
                'json': [match[0]]
            }

            json_data = json.dumps(data)
            response = requests.post(url, headers=headers, data=json_data)
            if len(response.json()) == 0:
                continue
            
            true_ner = response.json()[match[0]]['NERtype']
            break

        if match:
            data = json.loads(query['query'])
            ner_type_list = data['query']['bool']['should'][0]['term']['NERtype']
            # ner_type_list is the ner column
            queries.append((query, match[0],ner_type_list, true_ner))
            if len(queries) == 4000:
                break


  2%|▏         | 8411/475897 [22:05<20:27:49,  6.35it/s]  


In [10]:
len(key_to_cell_sample)

4000

In [None]:
key_to_cell_sample = {key: value for key, value in key_to_cell.items() if value[1].split('/')[-1] in missing_values}
# just for 2T missing values comparison

In [15]:
def get_query(name, value):
    name = str(name).replace('"', ' ')

    if value is not None:
        # Hard filtering constraint
        query_dict = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"name": {"query": name, "boost": 2.0}}},
                        {"term": {"NERtype": value}}
                    ]
                }
            }
        }
        params = {
            'name': name,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            'query': json.dumps(query_dict),  # Convert the query dictionary to a JSON string
            'sort': [
                '{"popularity": {"order": "desc"}}'
            ]
        }    

    return params

url = 'https://lamapi.hel.sintef.cloud/entity/labels?token=lamapi_demo_2023'

# Define the headers
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

queries = []
for key in tqdm(key_to_cell_sample):
    id_table, _, id_col = key.split(" ")
    name = key_to_cell[key][0]
    q_ids = key_to_cell[key][1].split(' ')
    new_key = f"{id_table} {id_col}"
    if new_key in ner_type:
        NER_type = ner_type[new_key]
        if NER_type is None:
            print(f"q_ids: {q_ids}, ner_type key: {new_key}")
        query = get_query(name, NER_type)
        
        matched_results = []
        for q_id in q_ids:
            match = re.search(r'Q(\d+)$', q_id)
            if not match:
                continue
            data = {
                'json': [match[0]]
            }

            json_data = json.dumps(data)
            response = requests.post(url, headers=headers, data=json_data)
            if len(response.json()) == 0:
                continue
            
            true_ner = response.json()[match[0]]['NERtype']
            break

        if match:
            data = json.loads(query['query'])
            ner_type_list = data['query']['bool']['must'][1]['term']['NERtype']
            # ner_type_list is the ner column
            queries.append((query, match[0],ner_type_list,true_ner))
            if len(queries) == 4000:
                break



100%|█████████▉| 3999/4000 [13:28<00:00,  4.95it/s]


In [16]:
import aiohttp
import asyncio
import backoff
import nest_asyncio
import random
from tqdm import tqdm
import numpy as np

# Assume queries is a list of tuples [(param1, id1), (param2, id2), ...]

failed_queries = {}
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Backoff decorator for handling retries with exponential backoff
@backoff.on_exception(
    backoff.expo, 
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError), 
    max_tries=10, 
    max_time=400
)
async def fetch(session, url, params, headers, semaphore):
    async with semaphore:
        # Convert all params to str, int, or float
        #params = {k: (int(v) if isinstance(v, np.integer) else str(v)) for k, v in params.items()}
        async with session.get(url, params=params, headers=headers, timeout=50) as response:
            try:
                response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
                return await response.json()
            except asyncio.TimeoutError:
                print(f"Request timed out for params: {params}")
                return []  # Return an empty list to handle the timeout gracefully
            except aiohttp.ClientError as e:
                print(f"ClientError for params {params}: {str(e)}")
                return []
            except Exception as e:
                print(f"Unexpected error for params {params}: {str(e)}")
                return []
async def process_item(session, url, id, headers, params, semaphore, pbar):
    try:
        data = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            print(f"404 Error: Resource not found for '{id}'")
            asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
            return 0, 0
        else:
            raise  # Re-raise the exception for other status codes

    num_result = len(data) if data else 0

    if data:
        for item in data:
            if id == item.get('id'):
                asyncio.get_event_loop().call_soon_threadsafe(pbar.update, 1)
                pos_score = item.get('pos_score', 0)
                if pos_score:
                    mrr_increment = (num_result - (pos_score * num_result)) / num_result
                else:
                    mrr_increment = 1 / num_result  # Assume worst case for MRR if pos_score is 0
                return mrr_increment, 1

    return 0, 0

async def main(queries, url, pbar, failed_queries):
    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    m_mrr = 0
    cont_el = 0

    async with aiohttp.ClientSession() as session:
        tasks = []
        for param, id, _, _ in queries:
            tasks.append(process_item(session, url, id, headers, param, semaphore, pbar))
        
        results = await asyncio.gather(*tasks)
        
        for (mrr_increment, count), (param, id, col_NERtype, item_NERtype) in zip(results, queries):
            if mrr_increment == 0 and count == 0:
                failed_queries[id] = (col_NERtype, item_NERtype)
            else:
                m_mrr += mrr_increment
                cont_el += count

        asyncio.get_event_loop().call_soon_threadsafe(pbar.close)

    print(f"Coverage of 2T: {cont_el / len(queries)}")
    print(f"Measure Reciprocal Rank of 2T: {m_mrr / len(queries)}")

# Check if there's already a running event loop
if __name__ == "__main__":
    nest_asyncio.apply()  # Apply nest_asyncio
    try:
        pbar = tqdm(total=len(queries))
        asyncio.run(main(queries, url, pbar, failed_queries))
    except RuntimeError:  # For environments like Jupyter
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(queries, url, pbar, failed_queries))


 10%|█         | 417/4000 [02:15<12:28,  4.79it/s]  

Request timed out for params: {'name': 'Regional Centre for Biotechnology Act, 2016', 'token': 'lamapi_demo_2023', 'kg': 'wikidata', 'limit': 1000, 'query': '{"query": {"bool": {"must": [{"match": {"name": {"query": "Regional Centre for Biotechnology Act, 2016", "boost": 2.0}}}, {"term": {"NERtype": "OTHERS"}}]}}}', 'sort': ['{"popularity": {"order": "desc"}}']}


 73%|███████▎  | 2905/4000 [14:15<05:22,  3.40it/s]  

Coverage of 2T: 0.72625
Measure Reciprocal Rank of 2T: 0.6855012499999815





In [17]:
len(failed_queries)

1092

In [18]:
with open('./data/Round4_failed_queries_HARD.json', 'w') as json_file:
    json.dump(failed_queries, json_file, indent=4)

In [19]:
with open('./data/Round4_failed_queries_HARD.json', 'r') as f:
    failed_queries_hard = json.load(f)

with open('./data/Round4_failed_queries_SOFT.json', 'r') as f:
    failed_queries_soft = json.load(f)

In [20]:
print(f"failed_queries_hard: {len(failed_queries_hard)} vs failed_queries_soft: {len(failed_queries_soft)}")

failed_queries_hard: 1092 vs failed_queries_soft: 268


In [21]:
column_type = []

for key in tqdm(key_to_cell):
    id_table, _, id_col = key.split(" ")
    name = key_to_cell[key][0]
    q_ids = key_to_cell[key][1].split(' ')
    new_key = f"{id_table} {id_col}"
    if new_key in ner_type:
        NER_type = ner_type[new_key]
        if NER_type is not None:
            column_type.append((q_ids, NER_type))
            #print(f"q_ids: {q_ids}, ner_type column: {NER_type}")

100%|██████████| 475897/475897 [00:07<00:00, 62398.35it/s] 


In [22]:
from collections import defaultdict
import json

# Initialize a dictionary to count mismatches by category
mismatch_categories = defaultdict(int)
tmp = []

# Iterate through the queries
for p, id, ner, ner_col in queries:
    # Check if the ID is in the missing values list
    if id in missing_values :
        # Parse the query JSON
        data = json.loads(p['query'])
        # Extract the NER type list
        ner_type_list = data['query']['bool']['must'][1]['term']['NERtype']
        # Print the NERtype and column NER for this ID
        print(f"id: {id} has item NERtype: {ner}, column NER: {ner_col}")
        # Compare item NERtype with column NER and count mismatches
        if ner != ner_col:
            # Create a category key for the mismatch
            category = f"{ner} != {ner_col}"
            # Increment the count for this mismatch category
            mismatch_categories[category] += 1

# Print the counts for each mismatch category
for category, count in mismatch_categories.items():
    print(f"Mismatch category '{category}': {count} occurrences")


NameError: name 'missing_values' is not defined

In [None]:
len(queries)

In [None]:
tmp = []
for p, id, ner, ner_col in queries:
    if id in missing_values:
        data = json.loads(p['query'])
        ner_type_list = data['query']['bool']['must'][1]['term']['NERtype']
        print(f"id: {id} has item NERtype: {ner}, column NER: {ner_col}")

# 2T
id: Q328446 has item NERtype: LOC, column NER: ORG

id: Q741830 has item NERtype: LOC, column NER: ORG

id: Q2276193 has item NERtype: LOC, column NER: ORG

id: Q988934 has item NERtype: LOC, column NER: OTHERS

id: Q153195 has item NERtype: LOC, column NER: ORG

id: Q2415851 has item NERtype: LOC, column NER: ORG

id: Q2387130 has item NERtype: LOC, column NER: ORG

id: Q233129 has item NERtype: LOC, column NER: OTHERS

id: Q732342 has item NERtype: LOC, column NER: ORG

id: Q3241019 has item NERtype: LOC, column NER: ORG

id: Q1998298 has item NERtype: LOC, column NER: OTHERS

id: Q755226 has item NERtype: LOC, column NER: ORG

In [None]:
for el in missing_values:
    print(f"id: {'http://wikidata.org/entity/'+el} has true NERtype: {failed_queries_hard[el]}")

In [25]:
from collections import defaultdict
import json

# Initialize a dictionary to count mismatches by category
mismatch_categories = defaultdict(int)



for el in missing_values:
    if failed_queries_hard[el][0] != failed_queries_hard[el][1]:
        # Create a category key for the mismatch
        category = f"{failed_queries_hard[el][0]} != {failed_queries_hard[el][1]}"
        #print(category)
        # Increment the count for this mismatch category
        mismatch_categories[category] += 1

# Print the counts for each mismatch category
for category, count in mismatch_categories.items():
    print(f"Mismatch category '{category}': {count} occurrences")

Mismatch category 'OTHERS != LOC': 557 occurrences
Mismatch category 'OTHERS != ORG': 261 occurrences
Mismatch category 'ORG != LOC': 7 occurrences


In [24]:
# 19 entities doesn’t match due to the Hard filtering constraint

missing_values = set(failed_queries_hard) - set(failed_queries_soft)
print(len(missing_values))

826
