In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import json
import spacy
import tqdm
import csv
import os
from scipy.stats import kurtosis

In [2]:
import glob

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
cell = 'Unforgiven – Clint Eastwood A Few Good Men – Rob Reiner , Andrew Scheinman Howards End – Ismail Merchant Scent of a Woman – Martin Brest The Crying Game – Stephen Woolley'

In [5]:
nlp(cell).ents

(Clint Eastwood A Few Good Men,
 Rob Reiner,
 Andrew Scheinman Howards End,
 Stephen Woolley)

In [6]:
len([tok for tok in nlp(str(cell))])

32

### Wikitables analysis

We calculate the number of links per cell in the tables. We want to determine how many of the cells in the WikiTables corpus have 2 or more links.

In [None]:
original_wikitables = "../data/original_dataset/tables.json"
wiki_links_ids = "../data/original_dataset/wiki_links-random.txt"

In [None]:
# Function to analyze entity counts in a batch
def analyze_batch(batch_data):
    """Count the surface links of every cell in a batch of tables.

    Each table in ``batch_data`` is read through its ``'tableData'`` entry,
    which is walked row by row and cell by cell. For every cell the length of
    its ``"surfaceLinks"`` entry is recorded, so a cell without links
    contributes 0.

    Returns a ``pd.Series`` holding one count per cell, in traversal order.
    """
    links_per_cell = [
        len(cell["surfaceLinks"])
        for table in batch_data
        for row in table['tableData']
        for cell in row
    ]
    return pd.Series(links_per_cell)

In [None]:
# Read at most `batch_size` tables (one JSON object per line) from the
# WikiTables dump and count the surface links in each of their cells.
batch_size = 1500000

with open(original_wikitables, "r") as file:
    # zip() against a bounded range stops at whichever ends first, so a file
    # with fewer than `batch_size` lines no longer raises StopIteration the
    # way a fixed number of next(file) calls does.
    batch_data = [json.loads(line) for _, line in zip(range(batch_size), file)]

batch_entity_counts = analyze_batch(batch_data)
# Only one batch is analysed, so its counts are the totals. Concatenating onto
# a dtype-less empty Series is avoided: it is deprecated and may change the
# dtype of the result, which would alter what describe() reports.
total_entity_counts = batch_entity_counts

total_entity_counts.to_csv("wiki_tables_entity_count.csv")
stats_all_batches = total_entity_counts.describe()

print("Summary Statistics on Entity Counts for All Batches:")
print(stats_all_batches)

In [None]:
total_entity_counts = pd.read_csv("wiki_tables_entity_count.csv")

In [None]:
len(total_entity_counts)

In [None]:
# Name the two columns loaded from the CSV, then keep only the per-cell link
# count by dropping the "cell" column.
column_names = ["cell", "num_ent"]
total_entity_counts.columns = column_names
total_entity_counts = total_entity_counts.drop(columns="cell")

In [None]:
no_zeros=total_entity_counts.loc[total_entity_counts.num_ent>0]

In [None]:
len(no_zeros) # 25 percent of all cells have at least 1 link, 4% of all have at least 2

In [None]:
no_zeros['num_ent'].describe()

In [None]:
# NOTE(review): the name suggests "more than 1 link", but the filter keeps only
# cells with strictly more than 2 links (i.e. 3 or more) — confirm the intended
# threshold before quoting the resulting percentage.
more_than_1=no_zeros.loc[no_zeros.num_ent>2]

In [None]:
len(more_than_1) # 3.7 percent of all cells have more than 2 links per cell.

### Other datasets analysis

The T2D, Limaye and WikiGS datasets are downloaded from http://www.cs.toronto.edu/~oktie/webtables/

In [None]:
t2d_path = "../Efthymiou_datasets/webtables-evaluation-data/csv/t2d_tables_instance/"
limaye_path = "../Efthymiou_datasets/webtables-evaluation-data/csv/LimayeGS/tables_instance/"
wiki_gold_standard_path = "../Efthymiou_datasets/webtables-evaluation-data/csv/WikipediaGS/tables_instance/csv/"

In [None]:
git_path = "../data/other_data/Git_tables/"

In [None]:
sato_path = "../data/other_data/sato_tables/all/K0/"  # iterate here over the K splits and compute the average stats

In [None]:
# NOTE(review): `load_tables` is defined in a later cell of this notebook, so on
# a fresh kernel that cell has to be run before this one.
dfs_wiki = load_tables(wiki_gold_standard_path)
# Keep only the tables at positions 100000..114999 of the loaded list.
dfs_wiki = dfs_wiki[100000:115000]

In [19]:
def load_tables(folder_path):
    """Read every file in ``folder_path`` as a CSV table.

    Files are visited in sorted name order so that the returned list, and any
    positional slice taken from it, is the same on every run; ``os.listdir``
    on its own returns entries in arbitrary order. Sub-directories are
    skipped. Malformed lines inside a file are dropped
    (``on_bad_lines='skip'``), and files that contain no data are reported
    and left out of the result.

    Parameters
    ----------
    folder_path : str
        Directory containing the table files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per successfully parsed file, in sorted file-name order.
    """
    dfs = []
    for csv_file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, csv_file)
        if not os.path.isfile(file_path):
            # A nested directory cannot be parsed as a table.
            continue
        try:
            dfs.append(pd.read_csv(file_path, engine='python', on_bad_lines='skip'))
        except pd.errors.EmptyDataError:
            print(f"No data found in CSV file: {csv_file}")
    return dfs

In [20]:
def compute_stats(dfs):
    """Compute size statistics over a collection of tables.

    For every DataFrame in ``dfs`` the number of rows and columns is recorded.
    For every cell, its value is converted with ``str`` and two counts are
    taken: the number of comma-separated parts, and the number of tokens
    produced by the notebook-level ``nlp`` pipeline.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        The tables to summarise.

    Returns
    -------
    dict
        Mean and standard deviation (``np.mean`` / ``np.std``) of the row
        counts, column counts, per-cell token counts and per-cell comma-part
        counts. The ``"kurtosis"`` entry is a placeholder that stays 0; this
        function does not compute it.
    """
    overall_num_rows = []
    overall_num_cols = []
    overall_num_toks = []
    num_ents_comma = []

    for df in dfs:
        overall_num_rows.append(len(df))
        overall_num_cols.append(len(df.columns))

        # Walk the cell values directly instead of building a Series per row
        # with iterrows(); the values visited are the same.
        for row_values in df.to_numpy():
            for cell_value in row_values:
                cell_text = str(cell_value)
                num_ents_comma.append(len(cell_text.split(",")))
                overall_num_toks.append(len(nlp(cell_text)))

    # Key order is kept as before because it determines the column order of
    # the one-row DataFrame that callers build from this dict.
    return {
        "mean_num_rows": np.mean(overall_num_rows),
        "std_num_rows": np.std(overall_num_rows),
        "mean_num_cols": np.mean(overall_num_cols),
        "std_num_cols": np.std(overall_num_cols),
        "mean_num_toks": np.mean(overall_num_toks),
        "std_num_toks": np.std(overall_num_toks),
        "kurtosis": 0,
        "mean_comma": np.mean(num_ents_comma),
        "std_comma": np.std(num_ents_comma),
    }


In [21]:
def add_stats(dataset_name, file_path, final_stats_df, tables=None,
              out_path="overall_dataset_stats.csv"):
    """Append the statistics of one dataset to a stats table and save it.

    Parameters
    ----------
    dataset_name : str
        Value stored in the ``"dataset"`` column of the new row.
    file_path :
        Not read by this function; kept so existing calls keep working.
    final_stats_df : pandas.DataFrame
        Stats table that the new row is appended to (it is not modified).
    tables : list of pandas.DataFrame, optional
        Tables to summarise. When omitted, the notebook-level variable
        ``dfs`` is used, which is what this function has always done; pass
        the tables explicitly to avoid depending on that hidden state.
    out_path : str, optional
        Where the combined table is written as CSV.

    Returns
    -------
    pandas.DataFrame
        ``final_stats_df`` with one extra row for ``dataset_name``.
    """
    if tables is None:
        # Backward-compatible fallback to the notebook-level list of tables.
        tables = dfs

    print(len(tables))
    stats = compute_stats(tables)

    stats_df = pd.DataFrame([stats])
    stats_df["dataset"] = dataset_name
    stats_df["num_tables"] = len(tables)

    df_final = pd.concat([final_stats_df, stats_df], ignore_index=True)

    # index=False: writing the index adds another "Unnamed: 0"-style column
    # every time the file is read back and saved again.
    df_final.to_csv(out_path, index=False)

    return df_final

In [22]:
# load current stats to add new 
current_stats = pd.read_csv("../../tabner/notebooks/overall_dataset_stats.csv")

In [None]:
added_wikiGS = add_stats("WikiGS", wiki_gold_standard_path, added_sato0)

In [None]:
# Computing the mean over the data split from SATO

# NOTE(review): `added_sato0` and `numeric_columns` are not defined in the
# visible cells of this notebook — confirm where they come from before running.
only_sato = added_sato0[added_sato0['dataset'].str.contains("sato")]
mean_row = only_sato[numeric_columns].mean(axis=0)
# The table count is summed over the matching rows rather than averaged.
mean_row['num_tables'] = only_sato['num_tables'].sum(axis=0)
mean_row['dataset'] = "sato_multi"

In [None]:
added_sato0.loc[12] = mean_row

### TURL entity linking eval dataset

The link for downloading the TURL data can be found in their GitHub repository: https://github.com/sunlab-osu/TURL

In [None]:
# The test split (test_own.table_entity_linking.json) is loaded the same way.
turl_dev_file = os.path.join('../turl_dataset', 'dev.table_entity_linking.json')
with open(turl_dev_file, 'r') as f:
    turl_dataset = json.load(f)

In [None]:
added_turl = add_stats("turl_el_dev", turl_dataset, current_stats)

In [None]:
turl_el = added_turl.loc[added_turl['dataset']=="turl_el"]
turl_el_dev= added_turl.loc[added_turl['dataset']=="turl_el_dev"]

In [None]:
# Computing the mean over the dev and test data from TURL
merged_df = pd.concat([turl_el, turl_el_dev])
mean_row = merged_df[numeric_columns].mean(axis=0)

turl_mean = mean_row.to_dict()
# Sum the table counts of the merged rows instead of hard-coding the total, so
# the value follows the data (same approach as the SATO aggregate row).
turl_mean['num_tables'] = int(merged_df['num_tables'].sum())
turl_mean['dataset'] = "turl_el_test_dev"

In [None]:
added_turl.loc[7] = turl_mean

In [None]:
added_turl.to_csv("overall_dataset_stats.csv")

### Wiki_TabNER analysis

Analysis of the initially extracted tables, without annotations

In [8]:
tabner_data_path="../data/Wiki_TabNER_final_labeled.json"
with open(tabner_data_path, 'r') as f:
    ner_tables = json.load(f)  

In [9]:
# Turn every table of the labeled Wiki-TabNER file into a DataFrame and collect
# the entry stored at position 6 of each table record alongside it.
dfs = []
named_entities_cell = []
# The loop index keeps the name `i` so that it remains in the notebook
# namespace after this cell exactly as before.
for i, table_entry in enumerate(ner_tables):
    table_record = table_entry[0]
    tableHeaders = table_record[4]
    table_data = table_record[5]

    named_entities_cell.append(table_record[6])

    columns = list(tableHeaders)

    # Each item is read as ((row index, column index), value); group the
    # values by row so the frame can be built row-wise.
    data_dict = {}
    for item in table_data:
        position = item[0]
        row_idx, col_idx = position[0], position[1]
        data_dict.setdefault(row_idx, {})[col_idx] = item[1]

    df = pd.DataFrame.from_dict(data_dict, orient='index')
    df.index.name = None
    dfs.append(df)

In [18]:
dfs[1]

Unnamed: 0,0,1,2
0,Best Individual Performance of the Century,"Kevin Bartlett ""Put his unique stamp on the 19...",Jack Titus Doug Strang Jack Dyer Roy Wright To...
1,Class of the Century,"Royce Hart ""Thrilled Tiger fans for a decade w...",Vic Thorp Bill Morris Ian Stewart Kevin Bartle...
2,The Strong & the Bold,"Jack Dyer ""No player in the history of the gam...",Basil McCormack Percy Bentley Max Oppy Roger D...
3,Defining Moment,"Save Our Skins ""On 15 August 1990, Richmond an...",Joining The VFL The Sash First Premiership Eat...
4,Servant of the Century,"Graeme Richmond ""Graeme Richmond filled a vari...",Charlie Callander Charlie Priestley Ray Dunn A...
5,Brave Act of the Century,"Francis Bourke ""Bourke collided with teammate ...",Bill Burns George Smeaton Eric Moore Francis B...
6,Premiership of the Century,"1967 ""Richmond, under coach Tommy Hafey , fini...",1920 1921 1932 1934 1943 1969 1973 1974 1980
7,Mark of the Century,"Michael Roach ""The superstar full-forward was ...",Thomas O'Halloran Royce Hart Malcolm Greenslad...
8,Goal of the Century,"Michael Mitchell ""The little Tiger excitement ...",John Ronaldson Bill Barrot Michael Roach Kevin...
9,Controversy of the Century,"Windy Hill Brawl ""On 18 May 1974, all hell bro...",Dean / Barassi Incident In 1963 Crowe / Nichol...


In [10]:
len(dfs)

61273

In [24]:
added_tabner = add_stats("wiki_tabner_final", tabner_data_path, current_stats)

61273


In [25]:
added_tabner

Unnamed: 0.1,Unnamed: 0,dataset,num_tables,mean_num_rows,std_num_rows,mean_num_cols,std_num_cols,mean_num_toks,std_num_toks,mean_num_ents,std_num_ents,mean_comma,std_comma,kurtosis
0,0.0,t2d,233,121.600858,115.509022,4.948498,1.793386,1.913527,1.791974,0.574758,0.539187,1.0,0.0,137.09515
1,1.0,limaye,428,34.549065,40.992761,3.787383,1.233917,1.706675,1.392773,0.322221,0.519874,1.033316,0.18275,12.628752
2,2.0,turl_el,7291,19.883281,14.243179,3.022631,1.113579,1.498448,0.901519,0.406849,0.525828,1.00815,0.090829,18.005219
3,3.0,git_tables,1101,58.196185,94.693164,16.873751,11.60074,1.868231,10.062218,0.760864,1.124126,1.037563,0.617178,12323.953373
4,4.0,sato_K1,15754,18.48064,75.620203,1.531801,0.785103,2.995293,8.307041,0.81391,1.053899,1.127742,0.741978,2054.619908
5,5.0,turl_el_dev,3636,15.117437,13.106033,2.679868,1.014657,1.587599,1.016828,,,1.0092,0.097303,11.624344
6,6.0,tabner_final,61235,12.698865,19.897576,5.206712,2.428827,4.607849,16.135489,,,1.247704,1.37209,0.0
7,7.0,turl_el_test_dev,10927,17.500359,13.674606,2.851249,1.064118,1.543024,0.959173,0.406849,0.525828,1.008675,0.094066,14.814781
8,8.0,sato_K3,15714,18.481991,77.590398,1.531437,0.801749,3.033713,8.182333,,,1.127927,0.984825,0.0
9,9.0,sato_K4,15753,17.897162,55.324801,1.53412,0.783969,2.924267,7.192327,,,1.117836,0.568125,0.0


In [23]:
current_stats

Unnamed: 0.1,Unnamed: 0,dataset,num_tables,mean_num_rows,std_num_rows,mean_num_cols,std_num_cols,mean_num_toks,std_num_toks,mean_num_ents,std_num_ents,mean_comma,std_comma,kurtosis
0,0,t2d,233,121.600858,115.509022,4.948498,1.793386,1.913527,1.791974,0.574758,0.539187,1.0,0.0,137.09515
1,1,limaye,428,34.549065,40.992761,3.787383,1.233917,1.706675,1.392773,0.322221,0.519874,1.033316,0.18275,12.628752
2,2,turl_el,7291,19.883281,14.243179,3.022631,1.113579,1.498448,0.901519,0.406849,0.525828,1.00815,0.090829,18.005219
3,3,git_tables,1101,58.196185,94.693164,16.873751,11.60074,1.868231,10.062218,0.760864,1.124126,1.037563,0.617178,12323.953373
4,4,sato_K1,15754,18.48064,75.620203,1.531801,0.785103,2.995293,8.307041,0.81391,1.053899,1.127742,0.741978,2054.619908
5,5,turl_el_dev,3636,15.117437,13.106033,2.679868,1.014657,1.587599,1.016828,,,1.0092,0.097303,11.624344
6,6,tabner_final,61235,12.698865,19.897576,5.206712,2.428827,4.607849,16.135489,,,1.247704,1.37209,0.0
7,7,turl_el_test_dev,10927,17.500359,13.674606,2.851249,1.064118,1.543024,0.959173,0.406849,0.525828,1.008675,0.094066,14.814781
8,8,sato_K3,15714,18.481991,77.590398,1.531437,0.801749,3.033713,8.182333,,,1.127927,0.984825,0.0
9,9,sato_K4,15753,17.897162,55.324801,1.53412,0.783969,2.924267,7.192327,,,1.117836,0.568125,0.0
