In [1]:
! pip install backoff

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1


In [2]:
import csv
import requests
import asyncio
import aiohttp
import json
import backoff

In [3]:
def chunks(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items.

    Args:
        lst: A sliceable sequence that supports ``len()`` (e.g. a list).
        chunk_size: Maximum number of items per chunk; must be positive.

    Yields:
        ``lst[i:i + chunk_size]`` for ``i = 0, chunk_size, 2 * chunk_size, ...``;
        the final slice may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_size <= 0:
        # A negative step would make range() empty and silently drop every
        # item, so reject non-positive sizes explicitly.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]

In [4]:
def _is_permanent_client_error(exc):
    """Return True for HTTP 4xx errors that retrying cannot fix.

    408 (Request Timeout) and 429 (Too Many Requests) are treated as transient
    and therefore remain retryable.
    """
    return (
        isinstance(exc, aiohttp.ClientResponseError)
        and 400 <= exc.status < 500
        and exc.status not in (408, 429)
    )


@backoff.on_exception(backoff.expo,  # Exponential wait between attempts
                      (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError),
                      max_tries=5,  # At most 5 attempts in total (the first call included)
                      max_time=300,  # Stop retrying once 300 seconds have elapsed overall
                      giveup=_is_permanent_client_error)  # Do not retry permanent 4xx responses
async def fetch(session, url, data, headers, params, semaphore):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON response.

    Args:
        session: The ``aiohttp.ClientSession`` used to send the request.
        url: Target endpoint.
        data: JSON-serialisable request body.
        headers: HTTP headers sent with the request.
        params: Query-string parameters sent with the request.
        semaphore: ``asyncio.Semaphore`` bounding the number of in-flight requests.

    Returns:
        The response body parsed as JSON.

    Raises:
        aiohttp.ClientResponseError: On a 4XX/5XX status, once retries are
            exhausted or immediately for a non-retryable 4XX.
        aiohttp.ClientError, asyncio.TimeoutError: On transport failures once
            retries are exhausted.
    """
    # The semaphore is acquired per attempt, so a request waiting for its next
    # retry does not hold a concurrency slot.
    async with semaphore:
        async with session.post(url, json=data, headers=headers, params=params) as response:
            response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
            return await response.json()

def _classify_kind(entity_id, objects):
    """Return the kind of ``entity_id`` given its ``objects`` mapping.

    ``objects`` maps an object ID to the collection of predicates linking the
    entity to that object. The mapping is scanned in order and the first
    matching rule wins:

    * an ID starting with ``'P'`` is a ``'predicate'`` (checked first, so it
      also holds when ``objects`` is empty);
    * ``P31`` to ``Q4167410`` -> ``'disambiguation'``;
    * ``P31`` to ``Q4167836`` -> ``'category'``;
    * any ``P279`` link -> ``'type'``;
    * otherwise -> ``'entity'``.
    """
    if entity_id.startswith('P'):
        return 'predicate'
    for object_id, predicates in objects.items():
        if 'P31' in predicates:
            if object_id == 'Q4167410':
                return 'disambiguation'
            if object_id == 'Q4167836':
                return 'category'
        if 'P279' in predicates:
            return 'type'
    return 'entity'


async def main():
    """Classify every ID in the global ``urls`` list via the LamAPI ``entity/objects`` endpoint.

    Reads the module-level ``urls`` sequence, sends it to the API in chunks and
    appends each returned ID to exactly one of the module-level lists
    ``predicate_list``, ``disambiguation_list``, ``category_list``,
    ``type_list`` or ``entity_list``. Those globals must exist before this
    coroutine is awaited. Failed requests and malformed payloads are printed
    and skipped. Nothing is returned.
    """
    url = "https://lamapi.hel.sintef.cloud/entity/objects"

    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    params = {
        "token": "lamapi_demo_2023",
        "kg": "wikidata"
    }

    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    chunk_size = 80  # Number of IDs sent in each request

    # NOTE(review): ssl=False turns off TLS certificate verification; keep it
    # only if the endpoint's certificate cannot be validated.
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        tasks = [
            fetch(session, url, {"json": chunk}, headers, params, semaphore)
            for chunk in chunks(urls, chunk_size)
        ]
        # return_exceptions=True: one failed chunk must not discard the others.
        responses = await asyncio.gather(*tasks, return_exceptions=True)

    # Map each kind to the module-level list that collects it.
    kind_lists = {
        'predicate': predicate_list,
        'disambiguation': disambiguation_list,
        'category': category_list,
        'type': type_list,
        'entity': entity_list,
    }

    for response in responses:
        if isinstance(response, BaseException):
            print(f"Error: {response}")
            continue
        try:
            wikidata = response['wikidata']
            if not wikidata:
                continue
            for el, info in wikidata.items():
                kind_lists[_classify_kind(el, info['objects'])].append(el)
        except (KeyError, TypeError, AttributeError) as e:
            # The payload is already decoded JSON, so the realistic failure
            # here is an unexpected structure rather than a parse error.
            print(f"Unexpected response structure ({e!r}): {response}")




In [None]:
files = [
    './my-data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv',
    './my-data/Dataset/Dataset/2T_Round4/gt/cea.csv',
    './my-data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv',
    './my-data/Dataset/Dataset/Round4_2020/gt/cea.csv'
]

kind_list_classification = {}

for file_path  in files:
    with open(file_path , mode='r') as file_gt:
        csv_reader = csv.reader(file_gt)
        urls = []
        
        for row in csv_reader:  
            url = row[3].split(' ')
            for el in url:
                #print(el)
                parts = el.split('/')
                el = parts[-1]
                if el not in urls:
                    urls.append(el)

        type_list = []
        entity_list = []
        predicate_list = []
        disambiguation_list = []
        category_list = []
        
        await main()
    
    
        start_index = file_path.find('Dataset/') + len('Dataset/')
        end_index = file_path.find('/', start_index) + 1
        next_slash_index = file_path.find('/', end_index)
        name_file = file_path[end_index:next_slash_index]
        kind_list_classification[name_file] = {
            "type_list": type_list,
            "entity_list": entity_list,
            "predicate_list": predicate_list,
            "disambiguation_list": disambiguation_list,
            "category_list": category_list
        }

    file_gt.close()
    

In [None]:
# Destination of the serialized classification dictionary.
json_file_path = "./kind_list_classification.json"

# Serialize first, then write the resulting JSON text in one go.
serialized_classification = json.dumps(kind_list_classification)
with open(json_file_path, "w") as json_file:
    json_file.write(serialized_classification)


In [21]:
from collections import defaultdict
import json

# NOTE(review): absolute, machine-specific directory; point this at the local
# copy of the data before running on another machine.
FAILED_QUERIES_DIR = 'C:/ALESSANDRO/Università/MAGISTRALE/SINTEF_thesis/lamAPI/data'


def _load_json(path):
    """Return the parsed content of the JSON file at ``path``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


failed_queries_hard = _load_json(f'{FAILED_QUERIES_DIR}/HT2_failed_queries_HARD.json')
failed_queries_soft = _load_json(f'{FAILED_QUERIES_DIR}/HT2_failed_queries_SOFT.json')

In [22]:
# Report how many entries each failed-queries mapping holds.
n_hard = len(failed_queries_hard)
n_soft = len(failed_queries_soft)
print(f"failed_queries_hard: {n_hard} vs failed_queries_soft: {n_soft}")

failed_queries_hard: 528 vs failed_queries_soft: 380


In [23]:
# Keys present in failed_queries_hard but absent from failed_queries_soft.
hard_ids = set(failed_queries_hard)
soft_ids = set(failed_queries_soft)
missing_values = hard_ids.difference(soft_ids)
print(len(missing_values))

151


In [11]:
import json
import random
import os
import pandas as pd
import re
import aiohttp
import asyncio
import backoff
import nest_asyncio
import time
from SPARQLWrapper import SPARQLWrapper, JSON
from requests import get
import numpy as np
import requests
from aiohttp import ClientResponseError
import logging
from tqdm import tqdm


In [24]:
url = 'https://lamapi.hel.sintef.cloud/entity/labels?token=lamapi_demo_2023'

# Define the headers
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT_S = 30

# Initialize a dictionary to count mismatches by category
mismatch_categories = defaultdict(int)

# One Session reuses the HTTP connection across the per-ID requests.
with requests.Session() as session:
    # Sorted so the printed output is reproducible (set order is arbitrary).
    for el in sorted(missing_values):
        json_data = json.dumps({'json': [el]})
        try:
            response = session.post(url, headers=headers, data=json_data,
                                    timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()
            payload = response.json()  # parse the body once
        except (requests.RequestException, ValueError) as exc:
            # Network failure, HTTP error status or a non-JSON body: report
            # it and carry on with the remaining IDs.
            print(f"Request for {el} failed: {exc}")
            continue

        # Empty answer, or an answer that does not describe this ID.
        if not payload or el not in payload:
            continue

        true_ner = payload[el]
        recorded = failed_queries_hard[el]
        # .get() keeps an entity without an English label or description from
        # aborting the whole loop.
        label_en = true_ner.get('labels', {}).get('en')
        print(f"{recorded[0]} vs True NER: {true_ner.get('NERtype')} - {label_en} ({true_ner.get('description')})")

        # The original condition compared element 0 with itself, so it was
        # always False and no category was ever counted. The category label
        # pairs element 0 with element 1, so that is the comparison used here.
        # NOTE(review): confirm element 1 (rather than the API's NERtype) is
        # the intended reference value.
        if recorded[0] != recorded[1]:
            # Create a category key for the mismatch
            category = f"{recorded[0]} != {recorded[1]}"
            # Increment the count for this mismatch category
            mismatch_categories[category] += 1

# Print the counts for each mismatch category
for category, count in mismatch_categories.items():
    print(f"Mismatch category '{category}': {count} occurrences")

OTHERS vs True NER: ORG - Underwood & Underwood (American photo studio and photo agency)
OTHERS vs True NER: ORG - Slate (U.S.-based online magazine)
LOC vs True NER: OTHERS - Altare della Patria (monument built in honour of Victor Emmanuel II of Italy)
ORG vs True NER: OTHERS - The Henrietta Barnett School (school in Barnet, UK)
OTHERS vs True NER: LOC - Gold Hill (city in Oregon, USA)
OTHERS vs True NER: LOC - Milionův buk (memorable tree in Brno-venkov District, Czech Republic)
OTHERS vs True NER: LOC - Lake Geneva (city in Walworth County, Wisconsin, United States)
OTHERS vs True NER: LOC - Arco di Trevi (an opus quadratum arch in Latium)
OTHERS vs True NER: LOC - SMS Derfflinger (battlecruiser)
LOC vs True NER: ORG - People's Socialist Republic of Albania (Marxist-Leninist government of Albania (1946–1992))
ORG vs True NER: LOC - Asahi (town in Shimoniikawa district, Toyama prefecture, Japan)
ORG vs True NER: LOC - Naka (town in Naka district, Tokushima prefecture, Japan)
OTHERS v

In [16]:
# Display the set of IDs held in missing_values (keys of failed_queries_hard
# that are not keys of failed_queries_soft).
# NOTE(review): this cell's execution count is lower than that of the cell
# defining missing_values, so its saved output may be stale; re-run it after
# a Restart & Run All.
missing_values

{'Q1086913',
 'Q1110876',
 'Q162667',
 'Q16553',
 'Q1876560',
 'Q1935584',
 'Q2029466',
 'Q2306',
 'Q2560090',
 'Q2574546',
 'Q2639042',
 'Q2668651',
 'Q2876725',
 'Q28993',
 'Q3241019',
 'Q36008',
 'Q6746',
 'Q678',
 'Q732342',
 'Q847869',
 'Q892401'}