### In this notebook we extract complex tables from the WikiTables corpus. 

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import json
import spacy
import tqdm
import csv
import os

In [2]:
# Path to the raw WikiTables dump (read below as line-delimited JSON).
# The dataset can be downloaded from http://websail-fe.cs.northwestern.edu/TabEL/
original_wikitables = "../data/WikiTables/tables.json"

In [3]:
# helper function to identify subject column
def allUnique(x):
    """Return True when no element of ``x`` occurs more than once.

    Helper used to identify a subject column. Iteration stops at the first
    repeated element, so a lazy iterable is only consumed up to that point.
    Elements are stored in a set and therefore must be hashable. An empty
    iterable counts as unique.
    """
    observed = set()
    for item in x:
        if item in observed:
            return False
        observed.add(item)
    return True

In [4]:
# Thresholds of the extraction filter.
MAX_SUBJECT_COL_INDEX = 1      # a subject column must show up within the first two columns
MIN_LINKED_CELL_RATIO = 0.9    # minimum share of cells in a column that contain a hyperlink
MIN_AVG_LINKS_PER_CELL = 2.0   # minimum mean number of hyperlinks per cell in that column


def _column_link_stats(cells):
    """Return (fraction of cells with >= 1 hyperlink, mean hyperlinks per cell).

    Hyperlinks are the ``<a href=...>`` tags found in each cell's
    ``tdHtmlString``. ``cells`` must be non-empty.
    """
    links_per_cell = [
        len(BeautifulSoup(cell["tdHtmlString"], "html.parser").find_all("a", href=True))
        for cell in cells
    ]
    linked_cells = sum(1 for n_links in links_per_cell if n_links >= 1)
    return linked_cells / len(cells), np.mean(links_per_cell)


# Stream the raw WikiTables dump and write out every table that has
#   (1) a column with all-unique cell texts (the "subject column"), found no
#       later than column index MAX_SUBJECT_COL_INDEX, and
#   (2) some later column that is densely linked (see thresholds above).
# The output is opened with "w" (not "a") so that re-running this cell rebuilds
# the file instead of appending duplicate records; the `with` block guarantees
# the file is closed even if a table raises half-way through.
with open("../data/wiki_tabner_original.json", mode="w") as f, tqdm.tqdm() as progress:
    with pd.read_json(original_wikitables, lines=True, chunksize=200) as reader:
        for chunk in reader:
            for index, t in chunk.iterrows():

                # initial table filter: at least 2 columns and at least 3 data rows
                if not (int(t["numCols"]) >= 2 and int(t["numDataRows"]) >= 3):
                    continue

                data_rows = list(t["tableData"])
                if not data_rows:
                    # Metadata promised rows but none are present; there is
                    # nothing to analyse (and the ratio below would divide by 0).
                    continue

                subject_col = None
                for col_index in range(int(t["numCols"])):

                    subjects = [row[col_index]["text"] for row in data_rows]
                    cells = [row[col_index] for row in data_rows]

                    # Only columns located after an already-found subject
                    # column are tested for link density.
                    if subject_col is not None:
                        perc_linked_cells, avg_links = _column_link_stats(cells)

                        # add table if it satisfies all conditions
                        if (perc_linked_cells >= MIN_LINKED_CELL_RATIO
                                and avg_links >= MIN_AVG_LINKS_PER_CELL):

                            df_to_write = pd.DataFrame(columns=t.keys())
                            df_to_write.loc[0] = t

                            f.write(df_to_write.to_json(orient="records", lines=True))
                            progress.update()  # the bar counts written tables
                            break

                    if allUnique(subjects):
                        subject_col = col_index
                        # continue considering this table

                    # if there is no subject column in the first two columns, give up on this table
                    if col_index == MAX_SUBJECT_COL_INDEX and subject_col is None:
                        break

68939it [48:33, 23.66it/s] 


In [5]:
# Load the tables written by the extraction cell above (line-delimited JSON).
dfs = pd.read_json("../data/wiki_tabner_original.json",lines=True)

In [6]:
# Number of tables that passed the extraction filter.
len(dfs)

68939

In [13]:
# Per-table statistics over the cleaned tables.
# NOTE: this cell reads `dfs_clean`, which is loaded by a cell that appears
# further down in the notebook; that cell has to be executed first.
overall_num_rows, overall_num_cols = [], []
linkspercell = []   # link counts, collected only for cells with at least one surface link
percent = []        # per-table fraction of cells with at least one surface link
no_cells = []

for _, df in dfs_clean.iterrows():
    rows = list(df["tableData"])
    cells = [cell for row in rows for cell in row]

    overall_num_rows.append(len(rows))
    overall_num_cols.append(df["numCols"])

    # Cells without any surface link are left out of `linkspercell`.
    nonzero_link_counts = [
        len(cell["surfaceLinks"]) for cell in cells if len(cell["surfaceLinks"]) > 0
    ]
    linkspercell.extend(nonzero_link_counts)

    # Share of this table's cells that carry one or more links.
    percent.append(len(nonzero_link_counts) / len(cells))

In [14]:
# Summary of the extracted TabNER tables, in this order:
#   mean columns per table, mean rows per table,
#   mean and standard deviation of surface links per cell.
# `linkspercell` only contains cells with at least one surface link, so the
# last two values describe linked cells, not all cells.
(np.mean(overall_num_cols), np.mean(overall_num_rows), np.mean(linkspercell), np.std(linkspercell))

(5.19524930725738, 12.598193263178127, 2.0109193481486645, 2.1236033779412002)

Filtering out tables with captions such as "External links". These tables have densely linked cells, but the links do not refer to entities.

In [9]:
# Captions of tables to drop. The match is exact and case-sensitive; the empty
# string entry also removes tables whose caption is empty.
illegal_tables=["External links","References","","sources"]

# Copy every table whose caption is not in `illegal_tables` to the cleaned file.
# The output is opened with "w" (not "a") so that re-running this cell rebuilds
# the file instead of appending duplicate records; the `with` block guarantees
# the file is closed even if an error occurs while filtering.
with open("../data/wiki_tabner_original_clean.json", mode="w") as f, tqdm.tqdm() as progress:
    with pd.read_json("../data/wiki_tabner_original.json", lines=True, chunksize=1000) as reader:
        for chunk in reader:
            for index, t in chunk.iterrows():

                # skip tables with an illegal caption
                if t["tableCaption"] in illegal_tables:
                    continue

                df_to_write = pd.DataFrame(columns=t.keys())
                df_to_write.loc[0] = t

                f.write(df_to_write.to_json(orient="records", lines=True))
                progress.update()  # the bar counts kept tables

62433it [01:15, 824.28it/s] 


In [10]:
# Load the caption-filtered tables written by the cell above (line-delimited JSON).
dfs_clean = pd.read_json("../data/wiki_tabner_original_clean.json",lines=True)

In [11]:
# Number of tables remaining after the caption filter.
len(dfs_clean)

62433

In [12]:
# Preview the first three cleaned tables.
dfs_clean.head(3)

Unnamed: 0,_id,numCols,numDataRows,numHeaderRows,numericColumns,order,pgId,pgTitle,sectionTitle,tableCaption,tableData,tableHeaders,tableId
0,10003473-2,5,25,1,[0],0.120435,10003473,Memphis Tigers men's basketball,NCAA Tournament Results,NCAA Tournament Results,"[[{'cellID': -1, 'textTokens': [], 'text': '19...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ye...",2
1,100040-4,3,10,1,[],0.680097,100040,Richmond Football Club,"""100 Tiger Treasures""","""100 Tiger Treasures""","[[{'cellID': -1, 'textTokens': [], 'text': 'Be...","[[{'cellID': -1, 'textTokens': [], 'text': 'Aw...",4
2,10004068-1,5,9,1,[0],0.448135,10004068,Red Bull BC One,Winners,Winners,"[[{'cellID': -1, 'textTokens': [], 'text': '20...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ye...",1
