In [None]:
! pip install backoff

In [2]:
import asyncio
import json
import logging
import os
import re

import aiohttp
import backoff  # Ensure this import is included for the decorator
import pandas as pd
from tqdm.asyncio import tqdm

In [3]:
## mapping definition

# Integer class ids used for the NER labels written to the training data.
# NOTE(review): id 3 is never assigned, and both OTHERS and the "no label"
# case (None) share id 4 — confirm this is intentional.
PERS, LOC, ORG = 0, 1, 2
OTHERS = 4
none = 4

# Label (or None when no label is available) -> integer class id.
entity_to_id = dict(PERS=PERS, LOC=LOC, ORG=ORG, OTHERS=OTHERS)
entity_to_id[None] = none


In [None]:
# Collect every distinct value of the `id` column across all CSV files in
# `directory`. The resulting `all_unique_ids` set is what main() later sends
# to the labels endpoint.
directory = './data/datasets_alligator_completi'

# Initialize an empty set to hold all unique ids
all_unique_ids = set()

# Number of CSV rows parsed per chunk; bounds memory use on the large files.
chunk_size = 10000  # Adjust this value as needed

# Get a list of all CSV files in the directory
csv_files = [filename for filename in os.listdir(directory) if filename.endswith(".csv")]

# Iterate over all files in the directory with a progress bar
for filename in tqdm(csv_files, desc="Processing files"):
    file_path = os.path.join(directory, filename)

    # Unique ids seen in the current file only (used for the report below).
    unique_ids_in_file = set()

    # Only the `id` column is needed here, so skip parsing all other columns.
    for chunk in pd.read_csv(file_path, usecols=['id'], chunksize=chunk_size):
        unique_ids_in_file.update(chunk['id'])

    all_unique_ids.update(unique_ids_in_file)

    # Print the number of unique ids in the current file and total accumulated unique ids
    print(f"File: {filename}, Unique IDs in File: {len(unique_ids_in_file)}, Total Unique IDs: {len(all_unique_ids)}")

# Print the final total number of unique ids across all files
print(f"Final Total unique IDs across all files: {len(all_unique_ids)}")

In [None]:
# Endpoint and request settings for the entity-labels lookup.
url = 'https://lamapi.hel.sintef.cloud/entity/labels'
params = {
    'lang': 'en',
    # Read the API token from the environment so a real credential never has
    # to be committed with the notebook; falls back to the demo token.
    'token': os.environ.get('LAMAPI_TOKEN', 'lamapi_demo_2023'),
}

headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

# entity id -> NER type (or None when no type could be extracted); filled by main().
id_to_ner = {}

# Semaphore to limit concurrent requests
semaphore = asyncio.Semaphore(70)

@backoff.on_exception(
    backoff.expo,
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError),
    max_tries=5,
    max_time=300,
)
async def fetch_with_backoff(session, url, payload, id):
    """POST `payload` to `url` and return `(id, decoded JSON body)`.

    The request uses the module-level `params` and `headers` and is only
    issued while holding the module-level `semaphore`. The decorator retries
    the call with exponential backoff (at most 5 tries / 300 seconds) when one
    of the listed client, HTTP-processing or timeout errors is raised; once
    the retries are exhausted the exception propagates to the caller.
    """
    async with semaphore, session.post(url, params=params, headers=headers, json=payload) as response:
        body = await response.json()
    return id, body

async def process_chunk(session, chunk):
    """Look up every id in `chunk` concurrently.

    One request is issued per id (the id is sent as a string inside a
    single-element "json" list). Returns the list of results produced by
    `fetch_with_backoff`, in completion order rather than input order.
    """
    pending = [
        fetch_with_backoff(session, url, {"json": [f"{unique_id}"]}, unique_id)
        for unique_id in chunk
    ]
    return [await finished for finished in asyncio.as_completed(pending)]

async def main():
    """Resolve the NER type of every id in `all_unique_ids` and checkpoint the result.

    The ids are processed in chunks of 10000. Each chunk gets its own
    `aiohttp.ClientSession`; the responses are folded into the module-level
    `id_to_ner` dict (ids whose response does not contain a usable 'NERtype'
    are stored as None), and the whole dict is written to 'id_to_ner.json'
    after every chunk.
    """
    all_unique_ids_list = list(all_unique_ids)

    # Chunk size
    chunk_size = 10000

    # Split IDs into chunks
    chunks = [all_unique_ids_list[i:i + chunk_size] for i in range(0, len(all_unique_ids_list), chunk_size)]

    checkpoint_path = 'id_to_ner.json'
    checkpoint_tmp_path = checkpoint_path + '.tmp'

    # The context manager closes the progress bar even if a chunk fails.
    with tqdm(total=len(chunks), desc="Processing Chunks") as progress_bar:
        for chunk in chunks:
            async with aiohttp.ClientSession() as session:
                responses = await process_chunk(session, chunk)

            for entity_id, response in responses:
                try:
                    id_to_ner[entity_id] = response[entity_id]['NERtype']
                except (LookupError, TypeError):
                    # Missing id / missing 'NERtype' key, or a body that is not
                    # the expected nested mapping: record "no type".
                    id_to_ner[entity_id] = None

            # Save the dictionary after each chunk. Writing to a temporary file
            # and renaming keeps the previous checkpoint intact if the run is
            # interrupted in the middle of the dump.
            with open(checkpoint_tmp_path, 'w') as json_file:
                json.dump(id_to_ner, json_file)
            os.replace(checkpoint_tmp_path, checkpoint_path)

            progress_bar.update(1)  # Update progress bar

if __name__ == "__main__":
    await (main())


In [4]:
# Reload the id -> NER type lookup from the JSON checkpoint on disk.
json_file_path = "./id_to_ner.json"
with open(json_file_path) as checkpoint:
    id_to_ner = json.loads(checkpoint.read())

# R1

In [4]:
# Locations of the Round1_T2D tables and of the CTA ground-truth file.
# (The directory is listed where it is actually iterated; a bare
# `os.listdir(...)` expression here only discarded its result.)
tables = "./data/Dataset/Dataset/Round1_T2D/tables/"
cta_file = './data/Dataset/Dataset/Round1_T2D/gt/CTA_Round1_gt.csv'

# Coarse NER category -> ontology class names (the last path segment of the
# class URL) that are counted as belonging to that category.
mapping = {
    "LOC": [
        "Place", "PopulatedPlace", "City", "Country",
        "Region", "Mountain", "Island", "Lake",
        "River", "Park", "Building", "HistoricPlace",
        "Monument", "Bridge", "Road", "Airport",
    ],
    "PERS": [
        "Person", "Artist", "Athlete", "Politician",
        "Scientist", "Writer", "Actor", "Musician",
        "MilitaryPerson", "Religious", "Royalty", "Criminal",
    ],
    "ORG": [
        "Organisation", "Company", "EducationalInstitution", "PoliticalParty",
        "SportsTeam", "Non-ProfitOrganisation", "GovernmentAgency", "ReligiousOrganisation",
        "Band", "Library", "Museum", "Hospital",
        "University", "TradeUnion",
    ],
}

# Inverse lookup: class name -> category.
reverse_mapping = dict(
    (class_name, category)
    for category in mapping
    for class_name in mapping[category]
)

# Map a class name taken from the CTA ground truth to its coarse category.
def map_class_to_category(class_name):
    """Return the category registered for `class_name` in `reverse_mapping`, or "OTHERS" if there is none."""
    if class_name in reverse_mapping:
        return reverse_mapping[class_name]
    return "OTHERS"

# Categorise every CTA ground-truth row: the class name is the last path
# segment of the value in column 2.
cta_keys = {}
df = pd.read_csv(cta_file, header=None)
class_names = df[2].astype(str).str.split('/').str[-1]
df["category"] = class_names.apply(map_class_to_category)
# cta_keys["key"] holds (Series of "<table> <column>" keys, Series of categories).
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

# "<table> <column>" -> category, built once. setdefault keeps the first
# occurrence of a duplicated key, i.e. the row a list.index() lookup would find.
category_by_key = {}
for cta_key, category in zip(cta_keys["key"][0], cta_keys["key"][1]):
    category_by_key.setdefault(cta_key, category)

# Keep only the annotated columns that exist in a table present on disk.
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    table_df = pd.read_csv(table_file)
    if table_df.shape[0] == 0:
        # Tables without data rows contribute no keys.
        continue
    # The key depends only on (table, column), so one pass over the columns
    # is enough; there is no need to revisit it for every row.
    for col in range(table_df.shape[1]):
        key = f"{table_name} {col}"
        if key in category_by_key:
            key_to_cell[key] = category_by_key[key]

100%|██████████| 64/64 [00:02<00:00, 25.14it/s]


In [9]:
# Build the Round1_T2D training file: every input row gets two extra columns,
# `ner_mention` (class id of the row's entity id, looked up in id_to_ner) and
# `ner_column` (class id of the row's table column, looked up in key_to_cell).
# Ids / columns without a label map to entity_to_id[None].
csv_file = './data/datasets_alligator_completi/Round1_T2D.csv'

chunk_size = 10000

# Progress-bar total: number of physical lines minus the header line. The
# handle is closed as soon as counting finishes.
with open(csv_file) as csv_handle:
    total_rows = sum(1 for _line in csv_handle) - 1

# Create an iterator to read the CSV file in chunks
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size)
processed_chunks = []

with tqdm(total=total_rows) as progress_bar:
    for chunk in chunk_iterator:
        mentions = []
        key_columns = []
        for key, entity_id in zip(chunk['key'], chunk['id']):
            # Parts 0 and 2 of the '-'-separated key form the lookup key
            # (presumably "<table>-<row>-<column>" — TODO confirm).
            key_parts = key.split('-')
            key_column = f"{key_parts[0]} {key_parts[2]}"
            mentions.append(entity_to_id[id_to_ner.get(entity_id)])
            key_columns.append(entity_to_id[key_to_cell.get(key_column)])

        chunk.insert(2, 'ner_mention', mentions)  # becomes the third column
        chunk.insert(3, 'ner_column', key_columns)  # becomes the fourth column
        processed_chunks.append(chunk)
        progress_bar.update(chunk.shape[0])

# Concatenate all processed chunks to form the final DataFrame
final_df = pd.concat(processed_chunks, ignore_index=True)

# Save the final DataFrame to a new CSV file
final_df.to_csv('./data/alligator_training_data/Round1_T2D_training_data.csv', index=False)

100%|██████████| 79274/79274 [00:07<00:00, 10928.02it/s]


# R3

In [10]:
# Locations of the Round3 tables and of the CTA ground-truth file.
# (The directory is listed where it is actually iterated; a bare
# `os.listdir(...)` expression here only discarded its result.)
tables = "./data/Dataset/Dataset/Round3_2019/tables/"
cta_file = './data/Dataset/Dataset/Round3_2019/gt/CTA_Round3_gt.csv'


# Derive one coarse NER category per row of the Round3 CTA ground truth.

# Last path segment of a type URL -> category.
R3_TYPE_TO_CATEGORY = {"Person": "PERS", "Location": "LOC", "Organisation": "ORG"}


def _category_for_row(type_cells):
    """Return the category of one ground-truth row.

    `type_cells` are the row's values from column 2 onwards; each non-missing
    cell is a space-separated list of type URLs. Cells are scanned left to
    right and the first URL whose last path segment is Person / Location /
    Organisation decides the category. Scanning stops at the first missing
    cell; "OTHERS" is returned when nothing matched.
    """
    for cell in type_cells:
        if pd.isna(cell):
            break
        # Loop variables are deliberately not called `url` / `type`, so the
        # global endpoint `url` and the builtin `type` stay untouched.
        for type_url in cell.split(' '):
            category = R3_TYPE_TO_CATEGORY.get(type_url.split('/')[-1])
            if category is not None:
                return category
    return "OTHERS"


cta_keys = {}
df = pd.read_csv(cta_file, header=None)
category_list = [
    _category_for_row(type_cells)
    for type_cells in df.iloc[:, 2:].itertuples(index=False, name=None)
]

df["category"] = category_list
# cta_keys["key"] holds (Series of "<table> <column>" keys, Series of categories).
cta_keys = {}
cta_keys["key"] = (df[0] + " " + df[1].astype('str'), df["category"])

# "<table> <column>" -> category, built once. setdefault keeps the first
# occurrence of a duplicated key, i.e. the row a list.index() lookup would find.
category_by_key = {}
for cta_key, category in zip(cta_keys["key"][0], cta_keys["key"][1]):
    category_by_key.setdefault(cta_key, category)

# Keep only the annotated columns that exist in a table present on disk.
key_to_cell = {}
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    table_df = pd.read_csv(table_file)
    if table_df.shape[0] == 0:
        # Tables without data rows contribute no keys.
        continue
    # The key depends only on (table, column), so one pass over the columns
    # is enough; there is no need to revisit it for every row.
    for col in range(table_df.shape[1]):
        key = f"{table_name} {col}"
        if key in category_by_key:
            key_to_cell[key] = category_by_key[key]

100%|██████████| 2161/2161 [09:59<00:00,  3.60it/s]


In [None]:
# Build the Round3 training file: every input row gets two extra columns,
# `ner_mention` (class id of the row's entity id, looked up in id_to_ner) and
# `ner_column` (class id of the row's table column, looked up in key_to_cell).
# Ids / columns without a label map to entity_to_id[None].
csv_file = './data/datasets_alligator_completi/Round3.csv'

chunk_size = 10000

# Progress-bar total: number of physical lines minus the header line. The
# handle is closed as soon as counting finishes.
with open(csv_file) as csv_handle:
    total_rows = sum(1 for _line in csv_handle) - 1

# Create an iterator to read the CSV file in chunks
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size)
processed_chunks = []

with tqdm(total=total_rows) as progress_bar:
    for chunk in chunk_iterator:
        mentions = []
        key_columns = []
        for key, entity_id in zip(chunk['key'], chunk['id']):
            # Parts 0 and 2 of the '-'-separated key form the lookup key
            # (presumably "<table>-<row>-<column>" — TODO confirm).
            key_parts = key.split('-')
            key_column = f"{key_parts[0]} {key_parts[2]}"
            mentions.append(entity_to_id[id_to_ner.get(entity_id)])
            key_columns.append(entity_to_id[key_to_cell.get(key_column)])

        chunk.insert(2, 'ner_mention', mentions)  # becomes the third column
        chunk.insert(3, 'ner_column', key_columns)  # becomes the fourth column
        processed_chunks.append(chunk)
        progress_bar.update(chunk.shape[0])

# Concatenate all processed chunks to form the final DataFrame
final_df = pd.concat(processed_chunks, ignore_index=True)

# Save the final DataFrame to a new CSV file
final_df.to_csv('./data/alligator_training_data/Round3_training_data.csv', index=False)

100%|██████████| 3818812/3818812 [04:56<00:00, 14807.09it/s]

# R4

In [5]:
# Load the precomputed Round4 lookup that the next cell reads as key_to_cell.
with open('./R4_ner_type.json') as ner_type_file:
    key_to_cell = json.loads(ner_type_file.read())

In [8]:
# Build the Round4 training file: every input row gets two extra columns,
# `ner_mention` (class id of the row's entity id, looked up in id_to_ner) and
# `ner_column` (class id of the row's table column, looked up in key_to_cell).
# Ids / columns without a label map to entity_to_id[None]. Chunks are streamed
# straight to `output_file` instead of being accumulated in memory.
csv_file = './data/datasets_alligator_completi/Round4.csv'
output_file = './data/alligator_training_data/Round4_training_data.csv'

chunk_size = 10000

# Progress-bar total: number of physical lines minus the header line. The
# handle is closed as soon as counting finishes.
with open(csv_file) as csv_handle:
    total_rows = sum(1 for _line in csv_handle) - 1

# Create an iterator to read the CSV file in chunks
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size)
header_written = False

with tqdm(total=total_rows) as progress_bar:
    for chunk in chunk_iterator:
        mentions = []
        key_columns = []
        for key, entity_id in zip(chunk['key'], chunk['id']):
            # Parts 0 and 2 of the '-'-separated key form the lookup key
            # (presumably "<table>-<row>-<column>" — TODO confirm).
            key_parts = key.split('-')
            key_column = f"{key_parts[0]} {key_parts[2]}"
            mentions.append(entity_to_id[id_to_ner.get(entity_id)])
            key_columns.append(entity_to_id[key_to_cell.get(key_column)])

        chunk.insert(2, 'ner_mention', mentions)  # becomes the third column
        chunk.insert(3, 'ner_column', key_columns)  # becomes the fourth column

        # The first chunk (re)creates the file with a header; later chunks append.
        chunk.to_csv(
            output_file,
            mode='a' if header_written else 'w',
            index=False,
            header=not header_written,
        )
        header_written = True
        progress_bar.update(chunk.shape[0])

100%|██████████| 9412429/9412429 [26:35<00:00, 5899.71it/s]


# 2T

In [None]:
# Locations of the 2T tables and of the CEA ground-truth file.
# (The directory is listed where it is actually iterated; a bare
# `os.listdir(...)` expression here only discarded its result.)
tables_path = "./data/Dataset/Dataset/2T_Round4/tables/"
cea_file = './data/Dataset/Dataset/2T_Round4/gt/cea.csv'

# Read the cea_file and index it by "<col0> <col1> <col2>" -> col3.
# process_table_file() probes these keys as "<table> <row+1> <col>", so
# columns 0-2 are presumably table name, row and column — TODO confirm.
df = pd.read_csv(cea_file, header=None)
df["key"] = df[0] + " " + df[1].astype(str) + " " + df[2].astype(str)
# For duplicated keys the last ground-truth row wins.
cea_values_dict = dict(zip(df["key"].values, df[3].values))
cea_keys_set = set(df["key"].values)

# Function to process a single table file
def process_table_file(table_file):
    """Collect the CEA-annotated cells of one table.

    Reads `table_file` as CSV and, for every data row, looks for the first
    column whose key "<table name> <row + 1> <col>" is present in the
    module-level `cea_keys_set`. Returns a dict mapping that key to
    `(cell value, cea_values_dict[key])`.

    Any error is logged (this relies on the notebook-level `import logging`)
    and an empty dict is returned, so a single unreadable table does not abort
    the run.

    NOTE(review): because of the `break`, at most one annotated cell per row
    is kept — confirm that later columns of the same row may be dropped.
    """
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        table_df = pd.read_csv(table_file)
        local_key_to_cell = {}

        n_rows, n_cols = table_df.shape
        for row in range(n_rows):
            for col in range(n_cols):
                # Keys use row + 1 relative to the DataFrame's 0-based data rows.
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = table_df.iloc[row, col]
                    local_key_to_cell[key] = (cell_value, cea_values_dict[key])
                    break  # Exit inner loop early as only one match per row/col is needed

        return local_key_to_cell
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# Full paths of every entry in the tables directory.
table_files = [os.path.join(tables_path, entry) for entry in os.listdir(tables_path)]

# Process the tables one after another, merging each per-table result into
# the combined lookup.
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    key_to_cell.update(process_table_file(table_file))


In [None]:
# Build the 2T-2020 training file: every input row gets two extra columns,
# `ner_mention` (class id of the row's entity id, looked up in id_to_ner) and
# `ner_column` (class id of the row's table column, looked up in key_to_cell).
# Ids / columns without a label map to entity_to_id[None].
#
# NOTE(review): the 2T cell above fills key_to_cell with
# "<table> <row> <col>" keys and tuple values, while this cell looks up
# "<table> <col>" and expects a category label. With that lookup no key can
# match, so `ner_column` would be entity_to_id[None] for every row — confirm
# which column-level lookup is meant to be used here.
csv_file = './data/datasets_alligator_completi/2T-2020.csv'

chunk_size = 10000

# Progress-bar total: number of physical lines minus the header line. The
# handle is closed as soon as counting finishes.
with open(csv_file) as csv_handle:
    total_rows = sum(1 for _line in csv_handle) - 1

# Create an iterator to read the CSV file in chunks
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size)
processed_chunks = []

with tqdm(total=total_rows) as progress_bar:
    for chunk in chunk_iterator:
        mentions = []
        key_columns = []
        for key, entity_id in zip(chunk['key'], chunk['id']):
            # Parts 0 and 2 of the '-'-separated key form the lookup key
            # (presumably "<table>-<row>-<column>" — TODO confirm).
            key_parts = key.split('-')
            key_column = f"{key_parts[0]} {key_parts[2]}"
            mentions.append(entity_to_id[id_to_ner.get(entity_id)])
            key_columns.append(entity_to_id[key_to_cell.get(key_column)])

        chunk.insert(2, 'ner_mention', mentions)  # becomes the third column
        chunk.insert(3, 'ner_column', key_columns)  # becomes the fourth column
        processed_chunks.append(chunk)
        progress_bar.update(chunk.shape[0])

# Concatenate all processed chunks to form the final DataFrame
final_df = pd.concat(processed_chunks, ignore_index=True)

# Save the final DataFrame to a new CSV file
final_df.to_csv('./data/alligator_training_data/2T_2020_training_data.csv', index=False)

# HT2

In [9]:
# Load the precomputed HardTable R2 lookup that the next cell reads as key_to_cell.
with open('./HT2_ner_type.json') as ner_type_file:
    key_to_cell = json.loads(ner_type_file.read())

In [10]:
# Build the HardTable R2 training file: every input row gets two extra columns,
# `ner_mention` (class id of the row's entity id, looked up in id_to_ner) and
# `ner_column` (class id of the row's table column, looked up in key_to_cell).
# Ids / columns without a label map to entity_to_id[None]. Chunks are streamed
# straight to `output_file` instead of being accumulated in memory.
csv_file = './data/datasets_alligator_completi/HardTableR2-2021.csv'
output_file = './data/alligator_training_data/HT2_training_data.csv'

chunk_size = 10000

# Progress-bar total: number of physical lines minus the header line. The
# handle is closed as soon as counting finishes.
with open(csv_file) as csv_handle:
    total_rows = sum(1 for _line in csv_handle) - 1

# Create an iterator to read the CSV file in chunks
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size)
header_written = False

with tqdm(total=total_rows) as progress_bar:
    for chunk in chunk_iterator:
        mentions = []
        key_columns = []
        for key, entity_id in zip(chunk['key'], chunk['id']):
            # Parts 0 and 2 of the '-'-separated key form the lookup key
            # (presumably "<table>-<row>-<column>" — TODO confirm).
            key_parts = key.split('-')
            key_column = f"{key_parts[0]} {key_parts[2]}"
            mentions.append(entity_to_id[id_to_ner.get(entity_id)])
            key_columns.append(entity_to_id[key_to_cell.get(key_column)])

        chunk.insert(2, 'ner_mention', mentions)  # becomes the third column
        chunk.insert(3, 'ner_column', key_columns)  # becomes the fourth column

        # The first chunk (re)creates the file with a header; later chunks append.
        chunk.to_csv(
            output_file,
            mode='a' if header_written else 'w',
            index=False,
            header=not header_written,
        )
        header_written = True
        progress_bar.update(chunk.shape[0])

100%|██████████| 438801/438801 [01:04<00:00, 6815.35it/s] 


# HT3

In [None]:
# Load the precomputed HardTable R3 lookup that the next cell reads as key_to_cell.
with open('./HT3_ner_type.json') as ner_type_file:
    key_to_cell = json.loads(ner_type_file.read())

In [None]:
# Build the HardTable R3 training file: every input row gets two extra columns,
# `ner_mention` (class id of the row's entity id, looked up in id_to_ner) and
# `ner_column` (class id of the row's table column, looked up in key_to_cell).
# Ids / columns without a label map to entity_to_id[None]. Chunks are streamed
# straight to `output_file` instead of being accumulated in memory.
#
# NOTE(review): these two paths start with '../data' whereas every other cell
# of this notebook uses './data' — confirm which working directory is intended.
csv_file = '../data/datasets_alligator_completi/HardTableR3-2021.csv'
output_file = '../data/alligator_training_data/HT3_training_data.csv'

chunk_size = 10000

# Progress-bar total: number of physical lines minus the header line. The
# handle is closed as soon as counting finishes.
with open(csv_file) as csv_handle:
    total_rows = sum(1 for _line in csv_handle) - 1

# Create an iterator to read the CSV file in chunks
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size)
header_written = False

with tqdm(total=total_rows) as progress_bar:
    for chunk in chunk_iterator:
        mentions = []
        key_columns = []
        for key, entity_id in zip(chunk['key'], chunk['id']):
            # Parts 0 and 2 of the '-'-separated key form the lookup key
            # (presumably "<table>-<row>-<column>" — TODO confirm).
            key_parts = key.split('-')
            key_column = f"{key_parts[0]} {key_parts[2]}"
            mentions.append(entity_to_id[id_to_ner.get(entity_id)])
            key_columns.append(entity_to_id[key_to_cell.get(key_column)])

        chunk.insert(2, 'ner_mention', mentions)  # becomes the third column
        chunk.insert(3, 'ner_column', key_columns)  # becomes the fourth column

        # The first chunk (re)creates the file with a header; later chunks append.
        chunk.to_csv(
            output_file,
            mode='a' if header_written else 'w',
            index=False,
            header=not header_written,
        )
        header_written = True
        progress_bar.update(chunk.shape[0])