In [1]:
! pip install backoff



In [2]:
import csv
import requests
import asyncio
import aiohttp
import json
import backoff

In [3]:
def chunks(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` yields nothing.
    """
    offsets = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in offsets)

In [4]:
def _is_permanent_client_error(exc):
    """Return True for HTTP 4XX responses that retrying cannot fix.

    408 (Request Timeout) and 429 (Too Many Requests) are treated as transient
    and therefore stay retryable.
    """
    return (
        isinstance(exc, aiohttp.ClientResponseError)
        and 400 <= exc.status < 500
        and exc.status not in (408, 429)
    )


@backoff.on_exception(backoff.expo,  # Exponential wait between attempts
                      (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError),
                      max_tries=5,  # At most 5 attempts in total
                      max_time=300,  # Stop retrying after 300 seconds overall
                      giveup=_is_permanent_client_error)  # Do not re-send requests the server rejected
async def fetch(session, url, data, headers, params, semaphore):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON response.

    Args:
        session: the ``aiohttp.ClientSession`` used to send the request.
        url: target endpoint.
        data: JSON-serialisable request body.
        headers: HTTP headers sent with the request.
        params: query-string parameters sent with the request.
        semaphore: ``asyncio.Semaphore`` bounding the number of requests in
            flight. It is acquired inside the function, so it is released
            before the retry decorator waits between attempts.

    Returns:
        The response body decoded by ``response.json()``.

    Raises:
        aiohttp.ClientResponseError: on a 4XX/5XX status. Permanent 4XX
            errors are raised immediately; other listed errors are raised
            once the retry budget is exhausted.
    """
    async with semaphore:
        async with session.post(url, json=data, headers=headers, params=params) as response:
            response.raise_for_status()  # Raises an exception for 4XX/5XX status codes
            return await response.json()

def _classify_kind(el, objects):
    """Return the kind label for a single ID.

    Args:
        el: the ID as keyed in the API response (e.g. ``"Q42"`` or ``"P31"``).
        objects: mapping of object ID -> properties linking ``el`` to that
            object (presumably a list of property IDs; only membership
            tests are performed on it).

    Returns:
        ``"predicate"``, ``"disambiguation"``, ``"category"``, ``"type"`` or
        ``"entity"``.
    """
    # A property ID is a predicate regardless of which objects it has, so this
    # is decided before (not while) iterating the objects.
    if el.startswith('P'):
        return "predicate"

    # The first object (in response order) that matches a rule decides the kind.
    for key, value in objects.items():
        if 'P31' in value:  # instance of
            if key == 'Q4167410':
                return "disambiguation"
            if key == 'Q4167836':
                return "category"
        if 'P279' in value:  # subclass of
            return "type"

    return "entity"


async def main():
    """Classify every ID in the module-level ``urls`` list via the LamAPI objects endpoint.

    The IDs are sent in chunks of 500 with at most 50 requests in flight.
    Each returned ID is appended to exactly one of the module-level lists
    ``predicate_list``, ``disambiguation_list``, ``category_list``,
    ``type_list`` or ``entity_list``; all of them (and ``urls``) must exist
    before this coroutine is awaited.

    Failed requests and malformed payloads are reported with ``print`` and
    skipped, so one bad chunk or entry does not abort the run.

    The API token is read from the ``LAMAPI_TOKEN`` environment variable and
    falls back to the public demo token when it is not set.
    """
    import os  # local import keeps this cell independent of the import cell

    url = "https://lamapi.hel.sintef.cloud/entity/objects"

    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    params = {
        "token": os.environ.get("LAMAPI_TOKEN", "lamapi_demo_2023"),
        "kg": "wikidata"
    }

    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests
    chunk_size = 500  # each chunk has 500 items

    # Resolved at call time, so the lists freshly created by the caller are used.
    buckets = {
        "predicate": predicate_list,
        "disambiguation": disambiguation_list,
        "category": category_list,
        "type": type_list,
        "entity": entity_list,
    }

    # NOTE(review): ssl=False disables TLS certificate verification; kept as in
    # the original, but confirm it is really required for this host.
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        tasks = [
            fetch(session, url, {"json": chunk}, headers, params, semaphore)
            for chunk in chunks(urls, chunk_size)
        ]
        responses = await asyncio.gather(*tasks, return_exceptions=True)

    # The payloads are already decoded, so the session is no longer needed here.
    for response in responses:
        # BaseException also covers asyncio.CancelledError returned by gather().
        if isinstance(response, BaseException):
            print(f"Error: {response}")
            continue

        try:
            entities = response['wikidata']
        except (KeyError, TypeError) as exc:
            print(f"Unexpected response structure ({exc!r}): {response}")
            continue

        if not entities:
            continue

        for el in entities:
            try:
                kind = _classify_kind(el, entities[el]['objects'])
            except (KeyError, TypeError, AttributeError) as exc:
                print(f"Unexpected entry for {el!r} ({exc!r})")
                continue
            buckets[kind].append(el)




In [5]:
files = [
    './my-data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv',
    #'./my-data/Dataset/Dataset/2T_Round4/gt/cea.csv',
    #'./my-data/Dataset/Dataset/Round3_2019/gt/CEA_Round3_gt_WD.csv',
    #'./my-data/Dataset/Dataset/Round4_2020/gt/cea.csv'
]

kind_list_classification = {}

for file_path  in files:
    with open(file_path , mode='r') as file_gt:
        csv_reader = csv.reader(file_gt)
        urls = []
        
        for row in csv_reader:  
            url = row[3].split(' ')
            for el in url:
                #print(el)
                parts = el.split('/')
                el = parts[-1]
                if el not in urls:
                    urls.append(el)

        type_list = []
        entity_list = []
        predicate_list = []
        disambiguation_list = []
        category_list = []
        print("before main")
        await main()
        print("after main")
    
        start_index = file_path.find('Dataset/') + len('Dataset/')
        end_index = file_path.find('/', start_index) + 1
        next_slash_index = file_path.find('/', end_index)
        name_file = file_path[end_index:next_slash_index]
        kind_list_classification[name_file] = {
            "type_list": type_list,
            "entity_list": entity_list,
            "predicate_list": predicate_list,
            "disambiguation_list": disambiguation_list,
            "category_list": category_list
        }

    file_gt.close()
    

before main
after main


In [6]:
# Display the collected IDs for every processed dataset, grouped by kind
# (the full lists are shown, so the output can be long).
kind_list_classification

{'Round1_T2D': {'type_list': ['Q140',
   'Q161248',
   'Q1816048',
   'Q199674',
   'Q122783',
   'Q173117',
   'Q270642',
   'Q693',
   'Q947251',
   'Q107211',
   'Q1938795',
   'Q4603',
   'Q18498',
   'Q208133',
   'Q54322',
   'Q829043',
   'Q158711',
   'Q34623',
   'Q484447',
   'Q876500',
   'Q130730',
   'Q133006',
   'Q161120',
   'Q204686',
   'Q25332',
   'Q26354',
   'Q54335',
   'Q81893',
   'Q156538',
   'Q164285',
   'Q182803',
   'Q1942487',
   'Q213005',
   'Q1639604',
   'Q208730',
   'Q4598',
   'Q54330',
   'Q1439079',
   'Q214293',
   'Q244813',
   'Q312218',
   'Q876274',
   'Q1044478',
   'Q193098',
   'Q597009',
   'Q33609',
   'Q81666',
   'Q1630964',
   'Q202895',
   'Q368495',
   'Q4856412'],
  'entity_list': ['Q1008351',
   'Q1016838',
   'Q102131',
   'Q1025',
   'Q1029',
   'Q1030',
   'Q103321',
   'Q104123',
   'Q104814',
   'Q1048800',
   'Q1049505',
   'Q1054942',
   'Q1058903',
   'Q1062',
   'Q106316',
   'Q1065711',
   'Q10743709',
   'Q1076784',
 