In [2]:
! pip install backoff



In [118]:
import asyncio
import os
import re

import aiohttp
import backoff  # Ensure this import is included for the decorator
import pandas as pd
from tqdm.asyncio import tqdm

# R1: add NER types to the Round1_T2D dataset

In [112]:
# Endpoint that fetch() POSTs to; the query string carries lang=en and the token.
url = (
    'https://lamapi.hel.sintef.cloud/entity/labels'
    '?lang=en&token=lamapi_demo_2023'
)

# Headers sent with every request: JSON in, JSON out.
headers = {'accept': 'application/json', 'Content-Type': 'application/json'}

# Input CSV for this section.
R1_file = './data/datasets_alligator_completi/Round1_T2D.csv'

# Filled by fetch(): maps a row's 'key' to the NER type returned for it (or None).
ner_mention = dict()

# Number of CSV rows read (and requested concurrently) per chunk; used by main().
chunk_size = 1000

# Retry the whole call with exponential backoff when the request fails with a
# client/protocol error or a timeout (at most 5 tries, bounded by max_time=300).
@backoff.on_exception(
    backoff.expo,
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError),
    max_tries=5,
    max_time=300
)
async def fetch(session, key, id, semaphore):
    """Request the NER type of one entity and record it in ``ner_mention``.

    Args:
        session: aiohttp client session used to POST the request.
        key: Key under which the result is stored in the module-level
            ``ner_mention`` dict.
        id: Entity identifier sent in the request body and used to index the
            JSON reply.
        semaphore: asyncio semaphore that bounds the number of requests in
            flight at the same time.

    Side effects:
        Sets ``ner_mention[key]`` to ``response_json[id]['NERtype']``, or to
        ``None`` when the reply does not contain that entry.

    Raises:
        Whatever the request raises once the backoff retries are exhausted.
    """
    async with semaphore:
        payload = {"json": [id]}
        async with session.post(url, headers=headers, json=payload) as response:
            response_json = await response.json()
            try:
                ner_mention[key] = response_json[id]['NERtype']
            except (KeyError, IndexError, TypeError):
                # Best effort: a reply without the expected structure (missing
                # id, missing 'NERtype', or not a mapping at all) is recorded
                # as "unknown" instead of aborting the whole batch. Only these
                # lookup errors are swallowed; anything else propagates.
                # NOTE(review): results are stored per `key`, so rows that
                # share a key overwrite each other -- confirm this is intended.
                ner_mention[key] = None

async def process_chunk(chunk, semaphore):
    """Run one fetch() per row of ``chunk`` and wait for all of them.

    Args:
        chunk: DataFrame slice with at least the columns ``key`` and ``id``.
        semaphore: asyncio semaphore forwarded to every fetch() call.

    A single aiohttp session is opened for the chunk and closed once every
    request of the chunk has completed.
    """
    async with aiohttp.ClientSession() as session:
        pending = [
            fetch(session, record['key'], record['id'], semaphore)
            for _, record in chunk.iterrows()
        ]
        await asyncio.gather(*pending)

async def main(csv_path=None):
    """Fetch NER types for every row of a CSV and return the annotated table.

    Args:
        csv_path: CSV file to process; it must provide the ``key`` and ``id``
            columns read by process_chunk(). Defaults to the module-level
            ``R1_file``.

    Returns:
        One DataFrame holding all rows of the CSV plus a ``mention`` column
        inserted at position 2, filled with ``ner_mention[key]`` (``None``
        when no value was recorded for the key).
    """
    if csv_path is None:
        csv_path = R1_file

    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests

    # Count the data rows once so tqdm can display a total. The context manager
    # closes the handle instead of leaving it to the garbage collector.
    with open(csv_path) as csv_file:
        total_rows = sum(1 for _ in csv_file) - 1  # Exclude header
    total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

    # Pass 1: issue the requests chunk by chunk and keep the parsed chunks so
    # the file does not have to be read a second time.
    chunks = []
    for chunk in tqdm(pd.read_csv(csv_path, chunksize=chunk_size), total=total_iterations):
        await process_chunk(chunk, semaphore)
        chunks.append(chunk)

    # Pass 2: attach the results. This runs only after every request finished,
    # because a later chunk may still update the value stored for a key that
    # already appeared in an earlier chunk.
    for chunk in tqdm(chunks):
        mentions = [ner_mention.get(key) for key in chunk['key']]
        chunk.insert(2, 'mention', mentions)  # 'mention' becomes the third column

    final_df = pd.concat(chunks, ignore_index=True)
    print("Processing complete.")

    return final_df

# Run the R1 pipeline. Top-level `await` is valid here because the notebook
# kernel executes cells inside a running event loop.
final_df = await main()


100%|██████████| 80/80 [38:25<00:00, 28.81s/it]
100%|██████████| 80/80 [00:04<00:00, 18.30it/s]


Processing complete.


In [113]:
final_df

Unnamed: 0,tableName,key,mention,id,ambiguity_mention,ncorrects_tokens,ntoken_mention,ntoken_entity,length_mention,length_entity,...,cpa_t3,cpa_t4,cpa_t5,cpa_r1,cpa_r2,cpa_r3,cpa_r4,cpa_r5,group,target
0,28086084_0_3127660530989916727,28086084_0_3127660530989916727-51-0,PERS,Q160922,0.014,1.0,3,3,16,16,...,0.880,0.880,0.840,0.000,0.0,0.0,0.0,0.0,0,1
1,28086084_0_3127660530989916727,28086084_0_3127660530989916727-51-0,PERS,Q171144,0.014,1.0,3,3,16,17,...,0.880,0.880,0.840,0.000,0.0,0.0,0.0,0.0,0,0
2,28086084_0_3127660530989916727,28086084_0_3127660530989916727-51-0,PERS,Q171238,0.014,1.0,3,3,16,17,...,0.880,0.880,0.840,0.000,0.0,0.0,0.0,0.0,0,0
3,28086084_0_3127660530989916727,28086084_0_3127660530989916727-51-0,PERS,Q118081,0.014,1.0,3,3,16,16,...,0.880,0.880,0.840,0.000,0.0,0.0,0.0,0.0,0,0
4,28086084_0_3127660530989916727,28086084_0_3127660530989916727-51-0,PERS,Q356619,0.014,1.0,3,3,16,17,...,0.880,0.800,0.800,0.000,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79269,60319454_0_3938426910282115527,60319454_0_3938426910282115527-47-0,OTHERS,Q10571,0.960,1.0,1,1,4,4,...,0.298,0.106,0.043,0.043,0.0,0.0,0.0,0.0,8075,0
79270,60319454_0_3938426910282115527,60319454_0_3938426910282115527-47-0,OTHERS,Q133772,0.960,1.0,1,1,4,4,...,0.681,0.426,0.298,0.000,0.0,0.0,0.0,0.0,8075,0
79271,60319454_0_3938426910282115527,60319454_0_3938426910282115527-47-0,OTHERS,Q57382,0.960,1.0,1,1,4,4,...,0.681,0.085,0.043,0.000,0.0,0.0,0.0,0.0,8075,0
79272,60319454_0_3938426910282115527,60319454_0_3938426910282115527-47-0,OTHERS,Q44780,0.960,1.0,1,1,4,4,...,0.681,0.106,0.043,0.000,0.0,0.0,0.0,0.0,8075,0


In [114]:
# Persist the annotated R1 table. Create the target directory first so the
# write does not fail on a fresh checkout where it does not exist yet.
output_dir = './data/alligator_training_data'
os.makedirs(output_dir, exist_ok=True)
final_df.to_csv(os.path.join(output_dir, 'R1_training_data.csv'), index=False)

# R3: add NER types to the Round3 dataset

In [None]:
# Endpoint that fetch() POSTs to; the query string carries lang=en and the token.
url = (
    'https://lamapi.hel.sintef.cloud/entity/labels'
    '?lang=en&token=lamapi_demo_2023'
)

# Headers sent with every request: JSON in, JSON out.
headers = {'accept': 'application/json', 'Content-Type': 'application/json'}

# Input CSV for this section.
R3_file = './data/datasets_alligator_completi/Round3.csv'

# Fresh result store for this section: maps a row's 'key' to the NER type
# returned for it (or None). Rebinding it discards results of earlier cells.
ner_mention = dict()

# Number of CSV rows read (and requested concurrently) per chunk; used by main().
chunk_size = 1000

# Retry the whole call with exponential backoff when the request fails with a
# client/protocol error or a timeout (at most 5 tries, bounded by max_time=300).
@backoff.on_exception(
    backoff.expo,
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError),
    max_tries=5,
    max_time=300
)
async def fetch(session, key, id, semaphore):
    """Request the NER type of one entity and record it in ``ner_mention``.

    Args:
        session: aiohttp client session used to POST the request.
        key: Key under which the result is stored in the module-level
            ``ner_mention`` dict.
        id: Entity identifier sent in the request body and used to index the
            JSON reply.
        semaphore: asyncio semaphore that bounds the number of requests in
            flight at the same time.

    Side effects:
        Sets ``ner_mention[key]`` to ``response_json[id]['NERtype']``, or to
        ``None`` when the reply does not contain that entry.

    Raises:
        Whatever the request raises once the backoff retries are exhausted.
    """
    async with semaphore:
        payload = {"json": [id]}
        async with session.post(url, headers=headers, json=payload) as response:
            response_json = await response.json()
            try:
                ner_mention[key] = response_json[id]['NERtype']
            except (KeyError, IndexError, TypeError):
                # Best effort: a reply without the expected structure (missing
                # id, missing 'NERtype', or not a mapping at all) is recorded
                # as "unknown" instead of aborting the whole batch. Only these
                # lookup errors are swallowed; anything else propagates.
                # NOTE(review): results are stored per `key`, so if several
                # rows share a key the last finished request wins -- confirm.
                ner_mention[key] = None

async def process_chunk(chunk, semaphore):
    """Run one fetch() per row of ``chunk`` and wait for all of them.

    Args:
        chunk: DataFrame slice with at least the columns ``key`` and ``id``.
        semaphore: asyncio semaphore forwarded to every fetch() call.

    A single aiohttp session is opened for the chunk and closed once every
    request of the chunk has completed.
    """
    async with aiohttp.ClientSession() as session:
        pending = [
            fetch(session, record['key'], record['id'], semaphore)
            for _, record in chunk.iterrows()
        ]
        await asyncio.gather(*pending)

async def main(csv_path=None):
    """Fetch NER types for every row of a CSV and return the annotated table.

    Args:
        csv_path: CSV file to process; it must provide the ``key`` and ``id``
            columns read by process_chunk(). Defaults to the module-level
            ``R3_file``.

    Returns:
        One DataFrame holding all rows of the CSV plus a ``mention`` column
        inserted at position 2, filled with ``ner_mention[key]`` (``None``
        when no value was recorded for the key).
    """
    if csv_path is None:
        csv_path = R3_file

    semaphore = asyncio.Semaphore(50)  # Limit to 50 concurrent requests

    # Count the data rows once so tqdm can display a total. The context manager
    # closes the handle instead of leaving it to the garbage collector.
    with open(csv_path) as csv_file:
        total_rows = sum(1 for _ in csv_file) - 1  # Exclude header
    total_iterations = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division to include last chunk

    # Pass 1: issue the requests chunk by chunk and keep the parsed chunks so
    # the file does not have to be read a second time.
    chunks = []
    for chunk in tqdm(pd.read_csv(csv_path, chunksize=chunk_size), total=total_iterations):
        await process_chunk(chunk, semaphore)
        chunks.append(chunk)

    # Pass 2: attach the results. This runs only after every request finished,
    # because a later chunk may still update the value stored for a key that
    # already appeared in an earlier chunk.
    for chunk in tqdm(chunks):
        mentions = [ner_mention.get(key) for key in chunk['key']]
        chunk.insert(2, 'mention', mentions)  # 'mention' becomes the third column

    final_df = pd.concat(chunks, ignore_index=True)
    print("Processing complete.")

    return final_df

# Run the R3 pipeline. Top-level `await` is valid here because the notebook
# kernel executes cells inside a running event loop.
final_df = await main()


In [None]:
# Persist the annotated R3 table. Create the target directory first so the
# write does not fail on a fresh checkout where it does not exist yet.
output_dir = './data/alligator_training_data'
os.makedirs(output_dir, exist_ok=True)
final_df.to_csv(os.path.join(output_dir, 'R3_training_data.csv'), index=False)