### In this notebook, we iterate over the extracted tables, transform their format, and add the span-based labels for the linked entities

In [5]:
import pandas as pd
import json
import csv
import os
import numpy as np
import re
import torch
from tqdm import tqdm

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Fixed seed so that any randomness drawn from `generator` is reproducible.
# NOTE(review): `generator` is not used by the cells visible in this part of the notebook.
seed_nr = 42
generator = torch.Generator().manual_seed(seed_nr)

In [8]:
# Labeled entities produced by notebook 2. The 'Unnamed: 0' column is dropped
# right after loading (presumably the index column written by an earlier
# `to_csv` call -- TODO confirm); the default positional index is kept.
dataset_entities = pd.read_csv(
    "../data/labeled_entities/full_dataset_entities_labeled_dbp_yago_final.csv"
).drop(columns='Unnamed: 0')
len(dataset_entities)

408781

In [9]:
# Map every entity title to its row label in `dataset_entities`.
# If a title occurs more than once, the last occurrence wins.
entity_index = dict(zip(dataset_entities['entity'], dataset_entities.index))

In [10]:
def add_mention(ent_text, tab_id, text_lookup, dataset_entities, entity_idex):
    """Record one mention of a linked entity in ``dataset_entities``.

    The row of the entity is found through ``entity_idex[text_lookup]``. Its
    ``mention`` and ``table_id`` columns hold comma-separated lists; the new
    surface text and table id are appended to them only when they are not
    already listed, so repeated links do not produce duplicates such as
    ``"t1,t2,t1"``.

    Parameters
    ----------
    ent_text : str
        Surface text of the mention as it appears in the table cell.
    tab_id : str
        Identifier of the table the mention was found in.
    text_lookup : str
        Entity title used as key into ``entity_idex``.
    dataset_entities : pandas.DataFrame
        Entity table with ``mention`` and ``table_id`` columns; modified in place.
    entity_idex : dict
        Mapping from entity title to row label of ``dataset_entities``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset_entities`` object that was passed in.

    Raises
    ------
    KeyError
        If ``text_lookup`` is not a key of ``entity_idex``.
    """

    def _merge(current, incoming):
        # Empty cell: the incoming value becomes the first entry.
        if pd.isna(current):
            return incoming
        # Nothing to add, or identical to the single stored value.
        if not incoming or incoming == current:
            return current
        # Already one of the comma-separated entries. Wrapping both sides in
        # commas makes the check match whole entries only. Values that contain
        # commas themselves cannot be told apart reliably with this separator.
        if f",{incoming}," in f",{current},":
            return current
        return f"{current},{incoming}"

    row_index = entity_idex[text_lookup]

    for column, incoming in (('mention', ent_text), ('table_id', tab_id)):
        dataset_entities.at[row_index, column] = _merge(
            dataset_entities.at[row_index, column], incoming
        )

    return dataset_entities

In [11]:
# Integer id for every entity class. Id 0 ("None") is also the label given to
# linked spans whose class is not one of the names below.
labels_dict = {
    class_name: class_id
    for class_id, class_name in enumerate([
        "None",
        "Activity",
        "Organisation",
        "ArchitecturalStructure",
        "Event",
        "Place",
        "Person",
        "Work",
    ])
}

In [12]:
# Sanity check: list the distinct class names present in the entity table,
# to compare against the keys of `labels_dict`.
dataset_entities['class'].unique()

array(['Activity', 'Organisation', 'ArchitecturalStructure', 'Event',
       'Place', 'Person', 'Work'], dtype=object)

In [13]:
# Plain numpy views of the entity columns, aligned by row position.
nclass, nwikidata_id, nentity = (
    dataset_entities[column].to_numpy()
    for column in ('class', 'wikidata_id', 'entity')
)

# nlabels[k] is the (class, wikidata_id) pair of the entity at row position k.
nlabels = list(zip(nclass, nwikidata_id))

In [14]:
# Example lookup: find the row position(s) of an entity title in `nentity`
# and show the (class, wikidata_id) pair stored at the first match. The
# position comes from the lookup itself rather than a hard-coded row number,
# so the example stays correct when the entity file changes.
germany_rows = np.where(nentity == "Germany")[0]
nlabels[germany_rows[0]] if germany_rows.size else None

('ArchitecturalStructure', 'Q2399023')

In [15]:
# Load the extracted tables: the file is read as JSON Lines (one record per
# line). The last expression shows how many table records were loaded.
data_path = "../data/wiki_tabner_original_clean.json"
dataset_df = pd.read_json(data_path,lines=True)
len(dataset_df)

62433

In [19]:
# Peek at the first two table records to see the available columns.
dataset_df.head(2)

Unnamed: 0,_id,numCols,numDataRows,numHeaderRows,numericColumns,order,pgId,pgTitle,sectionTitle,tableCaption,tableData,tableHeaders,tableId
0,10003473-2,5,25,1,[0],0.120435,10003473,Memphis Tigers men's basketball,NCAA Tournament Results,NCAA Tournament Results,"[[{'cellID': -1, 'textTokens': [], 'text': '19...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ye...",2
1,100040-4,3,10,1,[],0.680097,100040,Richmond Football Club,"""100 Tiger Treasures""","""100 Tiger Treasures""","[[{'cellID': -1, 'textTokens': [], 'text': 'Be...","[[{'cellID': -1, 'textTokens': [], 'text': 'Aw...",4


In [None]:
# Convert every extracted table into the flat record
#   [tab_id, pgTitle, sectionTitle, tableCaption, headers, cells, labels]
# and attach span-based entity labels to the linked mentions in its cells.
#   headers: [[[-1, col], header_text], ...]            (first header row only)
#   cells:   [[[row, col], cell_text], ...]
#   labels:  one entry per cell that has links, each entry being
#            [[(row, col, span_start, span_end, class_id), ...]]
all_tables = []
# surface text -> (linked title, table id) for links whose surface text could
# not be located in the cell text; those spans use the offsets stored with the link.
weird_entities = {}
big_tables = 0
error_table = 0

# Row position of the first occurrence of every entity title. One dict lookup
# per link replaces a scan over all of `nentity`, and a match at position 0 is
# found correctly (testing the index array with `.any()` treats [0] as "no match").
first_entity_pos = {}
for pos, title in enumerate(nentity):
    first_entity_pos.setdefault(title, pos)

# The rows are taken from `dataset_df`, which already holds the parsed file,
# so the progress total is the real number of tables.
with tqdm(total=len(dataset_df)) as progress:
    for _, t in dataset_df.iterrows():
        tab_id = t["_id"]
        pg_Title = t["pgTitle"]
        section_Title = t["sectionTitle"]
        table_caption = t["tableCaption"]
        tableHeaders = t["tableHeaders"]
        table = t["tableData"]

        # Only the first header row is kept; tables without header rows get an empty list.
        header_rows = list(tableHeaders)
        headers_cells = header_rows[0] if header_rows else []
        tableHeaders_cleaned = [
            [[-1, c], head['text']] for c, head in enumerate(headers_cells)
        ]

        # The cell pass runs once per table, independent of the number of header cells.
        table_data = []
        row_labels = []
        for i, row in enumerate(table):
            for j, cell in enumerate(row):
                cell_text = cell["text"]
                table_data.append([[i, j], cell_text])

                surface_links = cell['surfaceLinks']
                if len(surface_links) == 0:
                    continue

                cell_spans = []
                for link in surface_links:
                    start_idx = link["offset"]
                    end_idx = link["endOffset"]
                    ent_text = link["surface"]
                    text_lookup = link["target"]["title"]

                    # Locate the mention literally in the cell text. Using the
                    # surface text as a regular expression breaks on characters
                    # such as "(" or "+".
                    if isinstance(cell_text, str) and isinstance(ent_text, str):
                        found_at = cell_text.find(ent_text)
                    else:
                        found_at = -1

                    if found_at >= 0:
                        token_start = found_at
                        token_end = found_at + len(ent_text)
                    else:
                        # Not found: fall back to the offsets stored with the link
                        # (never reuse the span of a previous link) and remember the case.
                        token_start = start_idx
                        token_end = end_idx
                        weird_entities[ent_text] = (text_lookup, tab_id)

                    pos = first_entity_pos.get(text_lookup)
                    ent_label = nlabels[pos] if pos is not None else (None, None)

                    # Entities with a wikidata id get the mention recorded in dataset_entities.
                    if ent_label[1]:
                        add_mention(ent_text, tab_id, text_lookup, dataset_entities, entity_index)

                    # Classes that are not in labels_dict are labeled 0.
                    class_id = labels_dict.get(ent_label[0], 0)
                    cell_spans.append((i, j, token_start, token_end, class_id))

                cell["subcell_labels"] = cell_spans
                row_labels.append([cell_spans])

        # Each table is stored wrapped in a one-element list; later cells read
        # it back as all_tables[k][0][...].
        new_table = [[tab_id,
                      pg_Title,
                      section_Title,
                      table_caption,
                      tableHeaders_cleaned,
                      table_data,
                      row_labels]]
        all_tables.append(new_table)
        progress.update()

with open('../data/Wiki_TabNER_final_batch.json', 'w') as f:
    json.dump(all_tables, f)

print(len(all_tables))

In [43]:
# Persist the entity table, including the `mention` and `table_id` values
# filled in by `add_mention` during the table loop.
dataset_entities.to_csv("../data/labeled_entities/dataset_entities_labeled_linked.csv")

In [21]:
# Rebuild one pandas DataFrame per transformed table from its
# [[row, col], text] cell list; `tab_ids` keeps the table ids in the same order.
dfs = []
tab_ids = []
for table_wrapper in all_tables:
    # Every entry of all_tables is a one-element list wrapping the table record.
    record = table_wrapper[0]
    tab_ids.append(record[0])

    tableHeaders = record[4]
    # NOTE(review): `columns` is collected but not applied to the DataFrame in
    # the code visible here, so the frames keep integer column labels.
    columns = list(tableHeaders)

    table_data = record[5]

    # row index -> {column index -> cell text}
    data_dict = {}
    for item in table_data:
        data_dict.setdefault(item[0][0], {})[item[0][1]] = item[1]

    df = pd.DataFrame.from_dict(data_dict, orient='index')

    # Remove the index name
    df.index.name = None
    dfs.append(df)