In [1]:
import pandas as pd
import json
import csv
import os
import numpy as np
import torch
import openai
import re
from collections import defaultdict
from tqdm import tqdm

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from transformers import BertForTokenClassification,BertTokenizerFast

In [None]:
from utils import convert_to_iob

In [4]:
seed_nr = 42
generator = torch.Generator().manual_seed(seed_nr)

In [5]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [7]:
dataset_entities = pd.read_csv("../data/labeled_entities/full_dataset_entities_labeled_dbp_yago_DBP.csv")
dataset_entities.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
def add_mention(ent_text, tab_id, text_lookup, dataset_entities):
    
    row_index = dataset_entities.index[dataset_entities['entity'] == text_lookup]

    if not row_index.empty:
        dataset_entities.loc[row_index, ['mention', 'table_id']] = [ent_text, tab_id]
        
    return dataset_entities

In [11]:
labels_dict = {"None":0,
"Activity": 1,
"Organisation": 2,
"ArchitecturalStructure":3,
"Event":4,
"Place":5,
"Person":6,
"Work":7,
}

In [12]:
dataset_entities['class'].unique()

array(['Activity', 'Organisation', 'ArchitecturalStructure', 'Event',
       'Place', 'Person', 'Work'], dtype=object)

In [13]:
nclass = dataset_entities['class'].to_numpy()
nwikidata_id = dataset_entities['wikidata_id'].to_numpy()
nentity = dataset_entities['entity'].to_numpy()

nlabels= list(zip(nclass,nwikidata_id))

In [16]:
np.where(nentity == "Germany")[0]
nlabels[41937]

('Place', 'Q183')

In [17]:
data_path = "../data/wiki_dbpedia_lvl_1_berttokenizerfast.json"

In [18]:
dataset_df = pd.read_json(data_path,lines=True)

In [19]:
len(dataset_df)

51293

In [20]:
dataset_df.head(2)

Unnamed: 0,_id,numCols,numDataRows,numHeaderRows,numericColumns,order,pgId,pgTitle,sectionTitle,tableCaption,tableData,tableHeaders,tableId,tableCaption_tokenized,tableCaption_input_ids
0,10004068-1,5,9,1,[0],0.448135,10004068,Red Bull BC One,Winners,Winners,"[[{'cellID': -1, 'textTokens': [], 'text': '20...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ye...",1,[winners],[4791]
1,10004122-1,5,3,2,"[1, 4]",0.060823,10004122,Daniel Smith (cricketer),Career Best Performances,Career Best Performances,"[[{'cellID': -1, 'textTokens': [], 'text': 'FC...","[[{'cellID': -1, 'textTokens': [], 'text': '',...",1,"[career, best, performances]","[2476, 2190, 4616]"


In [21]:
len(dataset_df)

51293

In [None]:
all_tables = []
weird_entities = {}
big_tables = 0
error_table = 0
with tqdm(total=51293) as progress:
    with pd.read_json(data_path, lines=True, chunksize=5000) as reader:
        for chunk in reader:
            for index, t in chunk.iterrows():   
    
                t = dataset_df.iloc[index]   

                new_table = []
                tableHeaders_cleaned = []
                tab_id = t["_id"]
                pg_Title = t["pgTitle"]
                section_Title = t["sectionTitle"]
                table_caption = t["tableCaption"]    
                tableHeaders = t["tableHeaders"]    
                table = t["tableData"]                    
                data_rows = [ r for r in table ]

                headers_cells = [h for h in tableHeaders][0]
                for c, head in enumerate(headers_cells):
                    tableHeaders_cleaned.append([[-1,c], head['text']])

                if len(data_rows)<50:

                    table_data = []  
                    row_labels = []
                    for i, row in enumerate(table):                            
                        for j, cell in enumerate(row):

                            per_cell_labels = []
                            cell_spans = []
                            table_data.append([[i,j],cell["text"]])

                           # first adding bio labels - this will also label tokens which don't have surfaceLinks

                            indices = [np.where(nentity == x["target"]["title"])[0] for x in cell['surfaceLinks']]
                            cell["labels"] = [nlabels[i[0]] if i.any() else (None,None) for i in indices]
                            tokens = tokenizer(cell['text'],add_special_tokens=False)
                            text_tokens = tokens.tokens()

                            surfaces = [tokenizer(cell['surfaceLinks'][i]['surface'],add_special_tokens=False).tokens() for i in range(len(cell['surfaceLinks']))]
                            mapped = convert_to_iob(text_tokens, surfaces, cell["labels"])

                            cell["mapped"] = mapped

                           # now adding the span-based labels for the linked entities

                            if len(cell['surfaceLinks']) > 0:
                                for k in range(len(cell['surfaceLinks'])):
                                    subcell_i = cell['surfaceLinks'][k]

                                    start_idx = subcell_i["offset"]
                                    end_idx = subcell_i["endOffset"]
                                    ent_text = subcell_i["surface"]
                                    text_lookup =  subcell_i["target"]["title"]                           
                            
                                    try: 
                                        span_search = re.search(ent_text, cell["text"])
                                        if span_search:
                                            token_start = span_search.span()[0]
                                            token_end = span_search.span()[1]
                                    except:
                                        token_start = start_idx
                                        token_end = end_idx
                                        weird_entities[ent_text] = (text_lookup, tab_id)                                            

                                    lookup_idx = np.where(nentity == text_lookup)[0]
                                    ent_label = nlabels[lookup_idx[0]] if lookup_idx.any() else (None,None)

                                    if ent_label[1]:
                                        add_mention(ent_text, tab_id, text_lookup, dataset_entities)

                                    if ent_label[0] in labels_dict:
                                        span_based_annotation = (i, j, token_start, token_end, labels_dict[ent_label[0]])
                                    else:
                                        span_based_annotation = (i, j, token_start, token_end, 0)           

                                    cell_spans.append(span_based_annotation)


                                cell["subcell_labels"] = cell_spans
                                per_cell_labels.append(cell_spans)
                                per_cell_labels.append(mapped)

                            if len(per_cell_labels)>0:
                                row_labels.append(per_cell_labels)                                    

                    new_table.append([tab_id,
                               pg_Title,
                               section_Title,
                               table_caption,
                               tableHeaders_cleaned,
                               table_data,
                               row_labels]
                               )
                    all_tables.append(new_table)       
                    progress.update()
                else:
                    big_table += 1

with open('../data/tables_NER_final.json', 'w') as f:
    json.dump(all_tables, f)

print(len(all_tables))
print("big_tables: ", big_tables)

 67%|███████████████████████████████████████████████████████████████████████████████████▏                                        | 34409/51293 [3:58:37<1:30:08,  3.12it/s]

In [20]:
len(all_tables)

NameError: name 'all_tables' is not defined

In [None]:
all_tables = []
big_tables = 0
error_table = 0
with tqdm(total=51293) as progress:
    with pd.read_json(data_path, lines=True, chunksize=5000) as reader:
        for chunk in reader:
            for index, t in chunk.iterrows():   
                
                    t = dataset_df.iloc[index]

                    new_table = []
                    tableHeaders_cleaned = []
                    tab_id = t["_id"]
                    pg_Title = t["pgTitle"]
                    section_Title = t["sectionTitle"]
                    table_caption = t["tableCaption"]    
                    tableHeaders = t["tableHeaders"]    
                    table = t["tableData"]                    
                    data_rows = [ r for r in table ]
                    
                    headers_cells = [h for h in tableHeaders][0]
                    for c, head in enumerate(headers_cells):
                        tableHeaders_cleaned.append([[-1,c], head['text']])

                    if len(data_rows)<50:

                        table_data = []  
                        row_labels = []
                        for i, row in enumerate(table):                            
                            for j, cell in enumerate(row):
                                
                                per_cell_labels = []
                                cell_spans = []
                                table_data.append([[i,j],cell["text"]])
                                
                               # first adding bio labels - this will also label tokens which don't have surfaceLinks

                                indices = [np.where(nentity == x["target"]["title"])[0] for x in cell['surfaceLinks']]
                                cell["labels"] = [nlabels[i[0]] if i.any() else (None,None) for i in indices]
                                tokens = tokenizer(cell['text'],add_special_tokens=False)
                                text_tokens = tokens.tokens()

                                surfaces = [tokenizer(cell['surfaceLinks'][i]['surface'],add_special_tokens=False).tokens() for i in range(len(cell['surfaceLinks']))]
                                mapped = convert_to_iob(text_tokens, surfaces, cell["labels"])

                                cell["mapped"] = mapped
                                                                
                               # now adding the span-based labels for the linked entities
                            
                                if len(cell['surfaceLinks']) > 0:
                                    for k in range(len(cell['surfaceLinks'])):
                                        subcell_i = cell['surfaceLinks'][k]

                                        start_idx = subcell_i["offset"]
                                        end_idx = subcell_i["endOffset"]
                                        ent_text = subcell_i["surface"]
                                        text_lookup =  subcell_i["target"]["title"]


                                        lookup_idx = np.where(nentity == text_lookup)[0]
                                        ent_label = nlabels[lookup_idx[0]] if lookup_idx.any() else (None,None)
                                        
                                        if ent_label[1]:
                                            add_mention(ent_text, tab_id, text_lookup, dataset_entities)
                                        
                                        if ent_label[0] in labels_dict:
                                            span_based_annotation = (i, j, start_idx, end_idx, labels_dict[ent_label[0]])
                                        else:
                                            span_based_annotation = (i, j, start_idx, end_idx, 0)           

                                        cell_spans.append(span_based_annotation)
    
                                        
                                    cell["subcell_labels"] = cell_spans

                                    per_cell_labels.append(cell_spans)
                                    per_cell_labels.append(mapped)
                                    #per_cell_BIOlabels.append(mapped)
                                    
                                if len(per_cell_labels)>0:
                                    row_labels.append(per_cell_labels)                                    

                            #print(row_labels)

                        new_table.append([tab_id,
                                   pg_Title,
                                   section_Title,
                                   table_caption,
                                   tableHeaders_cleaned,
                                   table_data,
                                  # header_labels,
                                   row_labels]
                                   )
                        all_tables.append(new_table)
                        progress.update()
                    else:
                        big_table += 1
                        
with open('../data/final_NER_labeled_dataset.json', 'w') as f:
    json.dump(all_tables, f)

print(len(all_tables))
print("big_tables: ", big_tables)

In [27]:
len(all_tables)

51293

In [42]:
dataset_entities

Unnamed: 0,entity,class,wikidata_id,source,mention,table_id
0,Scrabble,Activity,Q170436,dbp,,
1,Spelljammer,Activity,Q1060172,dbp,Spelljammer,16407696-40
2,Monopoly_(game),Activity,Q17243,dbp,Monopoly,19187999-3
3,Cluedo,Activity,Q17245,dbp,Clue,19187999-3
4,Classic_BattleTech,Activity,Q5128099,dbp,Classic BattleTech,21390640-2
...,...,...,...,...,...,...
282316,1978_in_poetry,Work,Q4578439,yago,1978,5531029-2
282317,1977_in_poetry,Work,Q4577920,yago,1977,5531029-2
282318,1976_in_poetry,Work,Q4577415,yago,1976,5531029-2
282319,Os_Lusíadas,Work,Q781898,yago,Os Lusíadas,843347-15


In [43]:
dataset_entities.to_csv("../data/entities_gt.csv")

In [45]:
with open('../data/final_NER_labeled_dataset.csv', 'w') as f:
    json.dump(all_tables, f)

In [32]:
all_tables[200][0]

['10283729-4',
 '2010 IAAF World Indoor Championships',
 'Men',
 'Men',
 [[[-1, 0], 'Event'],
  [[-1, 1], 'Gold'],
  [[-1, 2], 'Gold'],
  [[-1, 3], 'Silver'],
  [[-1, 4], 'Silver'],
  [[-1, 5], 'Bronze'],
  [[-1, 6], 'Bronze']],
 [[[0, 0], '60 m details'],
  [[0, 1], 'Dwain Chambers Great Britain'],
  [[0, 2], '6.48 WL'],
  [[0, 3], 'Mike Rodgers United States'],
  [[0, 4], '6.53'],
  [[0, 5], 'Daniel Bailey Antigua and Barbuda'],
  [[0, 6], '6.57'],
  [[1, 0], '400 m details'],
  [[1, 1], 'Chris Brown Bahamas'],
  [[1, 2], '45.96 SB'],
  [[1, 3], 'William Collazo Cuba'],
  [[1, 4], '46.31 PB'],
  [[1, 5], 'Jamaal Torrance United States'],
  [[1, 6], '46.43'],
  [[2, 0], '800 m details'],
  [[2, 1], 'Abubaker Kaki Sudan'],
  [[2, 2], '1:46.23 SB'],
  [[2, 3], 'Boaz Kiplagat Lalang Kenya'],
  [[2, 4], '1:46.39'],
  [[2, 5], 'Adam Kszczot Poland'],
  [[2, 6], '1:46.69'],
  [[3, 0], '1500 m details'],
  [[3, 1], 'Deresse Mekonnen Ethiopia'],
  [[3, 2], '3:41.86'],
  [[3, 3], 'Abdalaati Ig

In [135]:
with open('ner_annotated_tables.json', 'w') as f:
    json.dump(all_tables, f)

In [20]:
with open('../data/final_NER_labeled_dataset_correct.json', 'r') as f:
    dataset = json.load(f)#[0]

In [222]:
len(dataset)

51293

In [212]:
ex

[['10287348-12',
  'Live Audio Wrestling',
  'End of Year Awards',
  'End of Year Awards',
  [[[-1, 0], 'Year'],
   [[-1, 1], 'Dan Lovranski'],
   [[-1, 2], 'Jason Agnew'],
   [[-1, 3], 'John Pollock'],
   [[-1, 4], 'Wai Ting']],
  [[[0, 0], '2008'],
   [[0, 1], 'Shawn Michaels vs. Ric Flair ( WrestleMania XXIV )'],
   [[0, 2],
    'Samoa Joe vs. Kurt Angle ( TNA Lockdown ) Shawn Michaels vs. Ric Flair ( WrestleMania XXIV ) (honorable mention)'],
   [[0, 3], 'Samoa Joe vs. Kurt Angle ( TNA Lockdown )'],
   [[0, 4], ''],
   [[1, 0], '2009'],
   [[1, 1], 'Edge vs. Jeff Hardy ( WWE Extreme Rules )'],
   [[1, 2],
    'Chris Jericho vs. Rey Mysterio ( WWE The Bash ) Edge vs. Jeff Hardy ( WWE Extreme Rules )'],
   [[1, 3], 'The Undertaker vs. Shawn Michaels ( WrestleMania XXV )'],
   [[1, 4], ''],
   [[2, 0], '2010'],
   [[2, 1], 'The Undertaker vs. Shawn Michaels ( WrestleMania XXVI )'],
   [[2, 2],
    'Daniel Bryan vs. Dolph Ziggler ( WWE Bragging Rights ) The Undertaker vs. Shawn Michael

In [218]:
a = 'Samoa Joe vs. Kurt Angle ( TNA Lockdown ) Shawn Michaels vs. Ric Flair ( WrestleMania XXIV ) (honorable mention)'

In [220]:
span_search = re.search("Ric Flair", a)
if span_search:
    print(span_search.span()[0])
    print(span_search.span()[1])

61
70


In [None]:
# save new span-based per table, per row

In [155]:
example

{(1, 2, 60, 73, 0): (1, 2, 74, 87, 0),
 (4, 2, 171, 182, 6): (4, 2, 205, 216, 6),
 (5, 0, 0, 1, 6): (5, 0, 0, 19, 6),
 (5, 2, 33, 52, 0): (5, 2, 54, 73, 0),
 (5, 2, 52, 67, 0): (5, 2, 85, 100, 0),
 (5, 2, 249, 262, 0): (5, 2, 322, 335, 0),
 (6, 2, 60, 77, 2): (6, 2, 71, 88, 2),
 (10, 2, 30, 47, 1): (10, 2, 143, 160, 1),
 (11, 2, 142, 147, 0): (11, 2, 170, 175, 0),
 (12, 2, 130, 181, 0): (12, 2, 143, 194, 0)}