In [1]:
import json
import math
import os
import random
import re
from collections import Counter
import matplotlib.pyplot as plt
from itertools import chain
from functools import reduce
from collections import Counter
from statistics import mean, median, mode
import numpy as np
import pandas as pd
from utils import loadpkl, flatten_1_deg

In [2]:
OUTPUT_DIR = '../global_data/all_tables'

def read_table(table, output_dir=None):
    """Load one table's JSON document from disk.

    Parameters
    ----------
    table : str
        Table name, with or without a trailing ``.json`` extension.
    output_dir : str, optional
        Directory holding the JSON files. Defaults to the module-level
        ``OUTPUT_DIR``.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from ``<output_dir>/<table>.json``.

    Raises
    ------
    FileNotFoundError
        If the JSON file does not exist.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    suffix = '.json'
    # Strip only a trailing ".json". Splitting on the first dot would truncate
    # names that contain other dots (e.g. "t.v2.json" or "../dir/t.json").
    if table.endswith(suffix):
        table = table[:-len(suffix)]
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(os.path.join(output_dir, f"{table}.json"), 'r', encoding='utf-8') as f:
        return json.load(f)

# Table stat functions

In [3]:
def table_shape_stats(X):
    """Print and return the distribution of table shapes in ``X``.

    Parameters
    ----------
    X : iterable
        Iterable of tables, each a nested sequence (rows -> cells).

    Returns
    -------
    tuple
        ``(t_sh, t_s, t_s_i, t_s_val)`` where ``t_sh`` is the shape of every
        table (first two dimensions only), ``t_s`` the distinct shapes in
        first-seen order, ``t_s_i`` their positional indices and ``t_s_val``
        the number of tables having each distinct shape.
    """
    # dtype=object: cells may hold token lists of differing lengths. Without
    # it NumPy >= 1.24 raises ValueError on such ragged input.
    t_sh = [np.array(table, dtype=object).shape[:2] for table in X]
    sh_distr = Counter(t_sh)
    # The "unqiue" spelling is kept so the output matches earlier recorded runs.
    print(f"Total shapes: {len(t_sh)}, unqiue: {len(sh_distr)}\n")

    t_s = list(sh_distr.keys())
    t_s_i = list(range(len(t_s)))
    t_s_val = list(sh_distr.values())

    # Most frequent shape first; sorted() is stable, so ties keep first-seen order.
    sh_distr = sorted(zip(t_s, t_s_val), key=lambda x: x[1], reverse=True)
    print(f"Shape distribution: {sh_distr}\n")
    return t_sh, t_s, t_s_i, t_s_val

def get_avg_table_sh(X):
    """Print the mean row count and mean column count over the tables in ``X``.

    Calls ``table_shape_stats`` (which prints the shape distribution) and then
    prints ``shape: <mean rows>  x  <mean cols>``. Returns ``None``.
    """
    t_sh, t_s, _, t_s_val = table_shape_stats(X)
    total = len(t_sh)
    if total == 0:
        # No tables: the mean is undefined; report it instead of dividing by zero.
        print("shape: nan  x  nan")
        return
    # Weighted mean over distinct shapes: sum(dimension * frequency) / table count.
    r = sum(shape[0] * count for shape, count in zip(t_s, t_s_val)) / total
    c = sum(shape[1] * count for shape, count in zip(t_s, t_s_val)) / total
    print(f"shape: {r}  x  {c}")
    
def cell_stats(X):
    """Print and return cell-level statistics for the tables in ``X``.

    Parameters
    ----------
    X : numpy.ndarray or sequence
        Collection of tables (rows -> cells -> tokens).

    Returns
    -------
    tuple
        ``(all_cells, all_cells_len, c_len, c_len_val, cell_len_distr)``:
        the flattened cells, the length of every cell, the distinct lengths
        (ascending), how many cells have each length, and the same
        information as a sorted list of ``(length, count)`` pairs.
    """
    tables = X.tolist() if hasattr(X, "tolist") else list(X)
    # flatten_1_deg comes from utils (not shown here); presumably it removes one
    # nesting level per call, so two calls go tables -> rows -> cells.
    all_cells = flatten_1_deg(flatten_1_deg(tables))
    # Cells are lists (unhashable), so compare them as tuples for the unique count.
    unique_cells = set(map(tuple, all_cells))
    print(f"Total cells: {len(all_cells)}, unqiue: {len(unique_cells)}\n")

    all_cells_len = [len(cell) for cell in all_cells]
    cell_len_distr = sorted(Counter(all_cells_len).items(), key=lambda i: i[0])
    if cell_len_distr:
        c_len, c_len_val = zip(*cell_len_distr)
    else:
        # zip(*[]) yields nothing to unpack; return empty tuples instead of raising.
        c_len, c_len_val = (), ()

    print(f"cell_len_distr: {cell_len_distr}")
    return all_cells, all_cells_len, c_len, c_len_val, cell_len_distr

# Vocab stats

In [4]:
# all_words = flatten_1_deg(all_cells)
# len(all_words),len(list(set(all_words)))

# String with num stat

In [5]:
# str_w_num = [word for word in all_words if contains_num(word)]
# len(str_w_num)/len(all_words),len(list(set(str_w_num)))/len(list(set(all_words))),

In [6]:
# plt.plot(t_s_i,t_s_val)
# plt.show()
# plt.plot(t_s_i[:300],t_s_val[:300])

In [7]:
# plt.plot(c_len[:100],c_len_val[:100])

# Filter functions for table

In [8]:
def remove_empty_tables(tables):
    """Return a copy of ``tables`` without the tables that contain no elements.

    A table is dropped when converting it to an array yields ``size == 0``
    (for example ``[]`` or ``[[], []]``).

    Parameters
    ----------
    tables : numpy.ndarray
        Expected to be a 1-D object array with one table per element, as it is
        used by ``preprocess``.

    Returns
    -------
    numpy.ndarray
        New array produced by ``np.delete``; the input is not modified.
    """
    # dtype=object keeps ragged tables (cells of differing token counts) from
    # raising ValueError on NumPy >= 1.24; the resulting size is unchanged.
    e_t = [
        idx
        for idx, table in enumerate(tables)
        if np.array(table, dtype=object).size == 0
    ]
    return np.delete(tables, e_t)

def remove_empty_cols(table):
    """Drop every column whose cells are all empty (``len(cell) == 0``).

    Parameters
    ----------
    table : sequence
        Rectangular table given as rows of cells; each cell must support
        ``len``.

    Returns
    -------
    list
        The table as nested lists with the all-empty columns removed. A table
        with no rows is returned as ``[]``.
    """
    # dtype=object: cells with differing token counts form a ragged structure,
    # which NumPy >= 1.24 refuses to convert without an explicit object dtype.
    data = np.array(table, dtype=object)
    if data.shape[0] == 0:
        # No rows means no columns to inspect (the original indexed shape[1] and failed).
        return data.tolist()

    def is_empty(column):
        return all(len(cell) == 0 for cell in column)

    # Collect the indices first and delete once, instead of copying the array
    # with np.delete for every removed column.
    empty_cols = [col for col in range(data.shape[1]) if is_empty(data[:, col])]
    return np.delete(data, empty_cols, axis=1).tolist()

def remove_empty_rows(table):
    """Drop every row whose cells are all empty (``len(cell) == 0``).

    Parameters
    ----------
    table : sequence
        Rectangular table given as rows of cells; each cell must support
        ``len``.

    Returns
    -------
    list
        The table as nested lists with the all-empty rows removed. Rows that
        have no cells at all count as empty.
    """
    # dtype=object: cells with differing token counts form a ragged structure,
    # which NumPy >= 1.24 refuses to convert without an explicit object dtype.
    data = np.array(table, dtype=object)
    if data.shape[0] == 0:
        return data.tolist()

    def is_empty(row):
        # all() over a zero-width row is True, which avoids the division by
        # zero the ratio-based check hit on such rows.
        return all(len(cell) == 0 for cell in row)

    # Collect the indices first and delete once, instead of copying the array
    # with np.delete for every removed row.
    empty_rows = [row for row in range(data.shape[0]) if is_empty(data[row, :])]
    return np.delete(data, empty_rows, axis=0).tolist()

def remove_dupl_rows(table):
    """Return the rows of ``table`` with later duplicates dropped.

    Rows are compared with ``in`` (equality); the first occurrence of each
    row is kept and the original order is preserved. The returned list is
    new, but it holds the same row objects as the input.
    """
    distinct_rows = []
    for candidate in table:
        if candidate in distinct_rows:
            continue
        distinct_rows.append(candidate)
    return distinct_rows

def remove_dupl_cols(table):
    """Return ``table`` with duplicate columns removed.

    Columns are compared by equality of their cell lists; the first
    occurrence of each column is kept and column order is preserved.

    Parameters
    ----------
    table : sequence
        Rectangular table given as rows of cells.

    Returns
    -------
    list
        The table as nested lists without duplicate columns. Input that has
        no column axis (for example ``[]``) or no columns is returned
        unchanged as nested lists.
    """
    # dtype=object: cells with differing token counts form a ragged structure,
    # which NumPy >= 1.24 refuses to convert without an explicit object dtype.
    data = np.array(table, dtype=object)
    if data.ndim < 2 or data.shape[1] == 0:
        # Nothing to deduplicate; the transpose-based version failed here.
        return data.tolist()

    seen_columns = []
    keep = []
    for col in range(data.shape[1]):
        column = data[:, col].tolist()
        if column not in seen_columns:
            seen_columns.append(column)
            keep.append(col)
    # Selecting the kept indices on axis 1 works for any number of trailing
    # dimensions, so no separate 2-D / 3-D transpose branches are needed.
    return data[:, keep].tolist()

def preprocess(X):
    """Clean every table in ``X`` and drop the tables that end up empty.

    Applies, in order: ``remove_empty_cols``, ``remove_empty_rows``,
    ``remove_dupl_cols`` and ``remove_dupl_rows``. After each step the tables
    that have become empty are removed with ``remove_empty_tables``. The
    array shape is printed once on entry and twice per step (after cleaning,
    after removing empty tables), nine lines in total.

    Parameters
    ----------
    X : numpy.ndarray
        Array with one table per element (``X.shape`` and item assignment
        are used).

    Returns
    -------
    numpy.ndarray
        New array of cleaned tables; the caller's array is left untouched.
    """
    print(X.shape)
    # Shallow copy: the first pass assigns into the array, which would
    # otherwise overwrite the caller's elements in place.
    X = X.copy()
    cleaners = (remove_empty_cols, remove_empty_rows, remove_dupl_cols, remove_dupl_rows)
    for clean in cleaners:
        for i in range(len(X)):
            X[i] = clean(X[i])
        print(X.shape)
        X = remove_empty_tables(X)
        print(X.shape)
    return X

# Table split functions

In [9]:
def split_data(data, max_row_len=None, max_col_len=None):
    """Yield consecutive blocks of ``data`` of at most the given size.

    Blocks are produced row-block by row-block and, within each row-block,
    left to right.

    Parameters
    ----------
    data : sequence
        Rectangular table given as rows of cells.
    max_row_len, max_col_len : int, optional
        Maximum rows / columns per block. Default to the module-level
        ``MAX_ROW_LEN`` / ``MAX_COL_LEN``.

    Yields
    ------
    numpy.ndarray
        Object-array slice holding one block.

    Raises
    ------
    ValueError
        If ``data`` is non-empty but has no column axis (rows of unequal length).
    """
    max_row_len = MAX_ROW_LEN if max_row_len is None else max_row_len
    max_col_len = MAX_COL_LEN if max_col_len is None else max_col_len
    # dtype=object: cells with differing token counts form a ragged structure,
    # which NumPy >= 1.24 refuses to convert without an explicit object dtype.
    data = np.array(data, dtype=object)
    if data.ndim < 2:
        if data.size == 0:
            return  # an empty table has no blocks
        raise ValueError("split_data expects a rectangular table (rows of equal length)")
    row_shape, column_shape = data.shape[:2]

    # range(0, n, step) visits exactly ceil(n / step) block start offsets.
    for row_start in range(0, row_shape, max_row_len):
        for col_start in range(0, column_shape, max_col_len):
            yield data[row_start:row_start + max_row_len,
                       col_start:col_start + max_col_len]


def split_overflow_table(j, max_row_len=None, max_col_len=None):
    """Split table ``j`` into blocks if it exceeds the size limits.

    Parameters
    ----------
    j : sequence
        Table given as rows of cells.
    max_row_len, max_col_len : int, optional
        Maximum rows / columns per block. Default to the module-level
        ``MAX_ROW_LEN`` / ``MAX_COL_LEN``.

    Returns
    -------
    list
        ``[j]`` when the table fits (or has no row/column grid to split),
        otherwise the non-empty blocks from ``split_data`` as nested lists.
    """
    max_row_len = MAX_ROW_LEN if max_row_len is None else max_row_len
    max_col_len = MAX_COL_LEN if max_col_len is None else max_col_len

    shape = np.array(j, dtype=object).shape
    if len(shape) < 2:
        # E.g. an empty table: nothing to split, pass it through unchanged.
        return [j]
    numDataRows, numCols = shape[:2]

    if numCols > max_col_len or numDataRows > max_row_len:
        return [
            block.tolist()
            for block in split_data(j, max_row_len, max_col_len)
            if block.size != 0
        ]
    return [j]

In [10]:
def remove_1x1_table(X):
    """Return a copy of ``X`` without the tables that have one row and one column.

    Parameters
    ----------
    X : numpy.ndarray
        Expected to be a 1-D object array with one table per element.

    Returns
    -------
    numpy.ndarray
        New array produced by ``np.delete``; the input is not modified.
    """
    # dtype=object keeps ragged tables (cells of differing token counts) from
    # raising ValueError on NumPy >= 1.24; the leading (rows, cols) is unchanged.
    ts_1 = [
        idx
        for idx, table in enumerate(X)
        if np.array(table, dtype=object).shape[:2] == (1, 1)
    ]
    return np.delete(X, ts_1)

# String without Numbers

In [11]:
# Load the pickled collection of table names and read each table's 'data'
# entry from its JSON file via read_table.
# NOTE(review): loadpkl comes from utils (not shown); presumably it unpickles
# the file. Only load pickles from trusted sources.
tables_subset = loadpkl('./data/wo_strnum3.0/postive_tables_set.pkl')
read_all_tables = [read_table(js)['data'] for js in tables_subset]
# Load the tokenised tables used for the rest of the notebook.
# Presumably these correspond one-to-one to the tables above - TODO confirm.
X = loadpkl('./data/wo_strnum3.0/x_tokenised.pkl')
print(X.shape)

(21320,)


# Removing empty & duplicate rows/cols/tables

In [12]:
# Run the cleaning pipeline; preprocess prints the array shape after every step.
X = preprocess(X)

(21320,)
(21320,)
(21320,)
(21320,)
(21320,)
(21320,)
(21320,)
(21320,)
(21320,)


In [13]:
# Print the shape distribution and the average table shape before splitting.
get_avg_table_sh(X)

Total shapes: 21320, unqiue: 855

Shape distribution: [((1, 1), 1654), ((1, 3), 1255), ((2, 1), 1170), ((1, 2), 788), ((2, 2), 678), ((2, 3), 603), ((3, 1), 537), ((4, 2), 520), ((3, 2), 513), ((4, 1), 489), ((3, 3), 475), ((5, 1), 374), ((5, 2), 364), ((4, 3), 351), ((6, 2), 350), ((5, 3), 339), ((8, 2), 284), ((10, 2), 278), ((6, 3), 270), ((6, 1), 250), ((7, 2), 225), ((8, 1), 218), ((7, 3), 209), ((3, 4), 206), ((8, 3), 199), ((2, 4), 195), ((1, 4), 183), ((10, 3), 181), ((9, 2), 177), ((10, 1), 175), ((12, 2), 167), ((4, 4), 161), ((7, 1), 161), ((9, 3), 153), ((5, 4), 144), ((12, 3), 143), ((6, 4), 138), ((11, 2), 129), ((9, 1), 121), ((11, 3), 109), ((13, 2), 106), ((16, 2), 106), ((15, 2), 95), ((13, 3), 95), ((11, 4), 93), ((8, 4), 92), ((7, 4), 91), ((14, 2), 88), ((11, 1), 86), ((10, 4), 86), ((12, 1), 85), ((9, 4), 85), ((15, 3), 81), ((16, 3), 76), ((14, 1), 73), ((14, 3), 68), ((18, 2), 67), ((12, 4), 66), ((15, 4), 64), ((13, 4), 63), ((5, 5), 63), ((17, 2), 59), ((4, 5)

In [14]:
# Block-size limits read by split_data, split_overflow_table and pad_table:
# a block holds at most MAX_ROW_LEN rows and MAX_COL_LEN columns.
MAX_ROW_LEN = 11
MAX_COL_LEN = 3

# Splitting into smaller blocks

In [15]:
print(X.shape)
# Split every table that exceeds MAX_ROW_LEN x MAX_COL_LEN. Each call returns
# a list of blocks, so the per-table lists are flattened into one list.
# flatten_1_deg comes from utils; presumably it removes one nesting level.
blocks = list(flatten_1_deg([split_overflow_table(table) for table in X.tolist()]))
# Fill a 1-D object array element by element. np.array(blocks) raises
# ValueError on NumPy >= 1.24 for ragged tables, and would become
# multi-dimensional if every block happened to share the same shape.
X = np.empty(len(blocks), dtype=object)
for idx, block in enumerate(blocks):
    X[idx] = block
print(X.shape)

X = preprocess(X)

(21320,)
(46158,)
(46158,)
(46158,)
(45279,)
(45279,)
(45279,)
(45279,)
(45279,)
(45279,)
(45279,)


In [16]:
# Print the shape distribution and the average block shape after splitting.
get_avg_table_sh(X)

Total shapes: 45279, unqiue: 33

Shape distribution: [((11, 3), 6771), ((11, 2), 4744), ((1, 1), 3615), ((11, 1), 2449), ((2, 1), 2283), ((1, 3), 1926), ((2, 3), 1383), ((2, 2), 1344), ((3, 1), 1327), ((3, 3), 1313), ((1, 2), 1301), ((4, 3), 1187), ((4, 1), 1114), ((5, 3), 1072), ((4, 2), 1013), ((3, 2), 997), ((6, 3), 980), ((6, 2), 864), ((5, 1), 851), ((10, 3), 841), ((7, 3), 795), ((5, 2), 764), ((8, 3), 693), ((6, 1), 692), ((9, 3), 682), ((10, 2), 663), ((8, 2), 588), ((7, 2), 545), ((7, 1), 516), ((10, 1), 506), ((9, 2), 498), ((8, 1), 491), ((9, 1), 471)]

shape: 6.303385675478698  x  2.073499856445593


In [17]:
# Drop the 1x1 tables; the shape is printed before and after to show how many were removed.
print(X.shape)
X = remove_1x1_table(X)
print(X.shape)

(45279,)
(41664,)


In [18]:
# Print the shape distribution and the average shape once the 1x1 tables are gone.
get_avg_table_sh(X)

Total shapes: 41664, unqiue: 32

Shape distribution: [((11, 3), 6771), ((11, 2), 4744), ((11, 1), 2449), ((2, 1), 2283), ((1, 3), 1926), ((2, 3), 1383), ((2, 2), 1344), ((3, 1), 1327), ((3, 3), 1313), ((1, 2), 1301), ((4, 3), 1187), ((4, 1), 1114), ((5, 3), 1072), ((4, 2), 1013), ((3, 2), 997), ((6, 3), 980), ((6, 2), 864), ((5, 1), 851), ((10, 3), 841), ((7, 3), 795), ((5, 2), 764), ((8, 3), 693), ((6, 1), 692), ((9, 3), 682), ((10, 2), 663), ((8, 2), 588), ((7, 2), 545), ((7, 1), 516), ((10, 1), 506), ((9, 2), 498), ((8, 1), 491), ((9, 1), 471)]

shape: 6.763536866359447  x  2.1666426651305684


In [19]:
# Print the cell statistics and keep only the flattened list of cells.
all_cells,_,_,_,_ = cell_stats(X)

Total cells: 623161, unqiue: 261249

cell_len_distr: [(0, 69979), (1, 364392), (2, 110066), (3, 33259), (4, 14413), (5, 5802), (6, 3861), (7, 3215), (8, 2001), (9, 1504), (10, 1340), (11, 1206), (12, 1009), (13, 728), (14, 683), (15, 2142), (16, 570), (17, 439), (18, 391), (19, 331), (20, 363), (21, 291), (22, 341), (23, 258), (24, 236), (25, 243), (26, 202), (27, 234), (28, 218), (29, 191), (30, 199), (31, 178), (32, 145), (33, 146), (34, 89), (35, 125), (36, 148), (37, 99), (38, 109), (39, 104), (40, 116), (41, 113), (42, 83), (43, 112), (44, 84), (45, 85), (46, 74), (47, 74), (48, 65), (49, 40), (50, 49), (51, 44), (52, 58), (53, 36), (54, 33), (55, 47), (56, 23), (57, 40), (58, 18), (59, 34), (60, 44), (61, 44), (62, 19), (63, 8), (64, 35), (65, 16), (66, 26), (67, 10), (68, 29), (69, 16), (70, 9), (71, 23), (72, 9), (73, 15), (74, 2), (75, 25), (76, 5), (77, 3), (78, 18), (79, 2), (80, 24), (81, 18), (82, 15), (83, 12), (84, 15), (85, 7), (86, 7), (89, 5), (90, 5), (91, 5), (92, 7

In [20]:
# Flatten the cells into one list of words (flatten_1_deg is from utils; presumably
# it removes one nesting level). The cell displays (total words, distinct words).
all_words = flatten_1_deg(all_cells)
len(all_words),len(list(set(all_words)))

(1193359, 225994)

In [21]:
# Sanity check: print any table that is still 1x1 (no output means none remain).
for i in X:
    # dtype=object avoids the ValueError that NumPy >= 1.24 raises for tables
    # whose cells hold differing numbers of tokens.
    if np.array(i, dtype=object).shape[:2]==(1,1):
        print(i)

In [None]:
def pad_table(table, max_row_len=None, max_col_len=None):
    """Pad ``table`` in place up to ``max_row_len`` x ``max_col_len`` cells.

    Empty cells receive a single ``'<PAD>'`` token, short rows are extended
    with ``['<PAD>']`` cells, and missing rows are appended as rows of
    ``['<PAD>']`` cells. Tables already at or above the limits are not
    truncated.

    Parameters
    ----------
    table : list
        Table as a list of rows, each a list of cells (token lists).
        It is modified in place.
    max_row_len, max_col_len : int, optional
        Target number of rows / columns. Default to the module-level
        ``MAX_ROW_LEN`` / ``MAX_COL_LEN``.

    Returns
    -------
    list
        The same ``table`` object, after padding.
    """
    max_row_len = MAX_ROW_LEN if max_row_len is None else max_row_len
    max_col_len = MAX_COL_LEN if max_col_len is None else max_col_len

    rows = len(table)  # captured before padding rows are appended
    for row in table:
        for cell in row:
            if len(cell) == 0:
                cell.append('<PAD>')
        missing_cols = max_col_len - len(row)
        row.extend(['<PAD>'] for _ in range(missing_cols))
    for _ in range(max_row_len - rows):
        # Build a fresh list per cell. `[['<PAD>']] * n` would make every cell
        # in the row the same list object, so editing one would edit them all.
        table.append([['<PAD>'] for _ in range(max_col_len)])
    return table

In [None]:
# Display one table before padding (21310 is an arbitrary hard-coded example index).
X[21310]

In [None]:
# Display the same table after padding.
# NOTE: pad_table appends to the rows and cells it is given, so X[21310] itself is modified.
pad_table(X[21310])