In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import math
import os
import random
import re
from collections import Counter, OrderedDict
import matplotlib.pyplot as plt
from itertools import chain
from functools import reduce
from statistics import mean, median, mode
import numpy as np
import pandas as pd
from multiprocessing import Pool
import re

from utils import loadpkl, flatten_1_deg, savepkl
from new_preprocess import tokenize_str, tokenize_table,read_table
from trec import mp

In [3]:
import spacy

# Load the medium English spaCy model and append the `merge_entities` and
# `merge_noun_chunks` components to its pipeline.
# NOTE(review): building a component with `create_pipe` and handing the object
# to `add_pipe` looks like the spaCy v2 calling convention - confirm which
# spaCy version this notebook is pinned to.
nlp = spacy.load("en_core_web_md")
nlp.add_pipe(nlp.create_pipe("merge_entities"))
nlp.add_pipe(nlp.create_pipe("merge_noun_chunks"))

In [None]:
['i a','j a','a a']

In [108]:
[i for i in nlp('I am vibhav')]

[I, am, vibhav]

In [5]:
def table_shape_stats(X):
    """Print and return the distribution of (rows, cols) shapes over tables.

    Args:
        X: iterable of tables. Each table is converted with ``np.array`` and
            only the first two entries of its shape are kept.

    Returns:
        A 4-tuple ``(t_sh, t_s, t_s_i, t_s_val)``:
            t_sh    -- one shape tuple per table, in input order;
            t_s     -- the distinct shapes, in first-seen order;
            t_s_i   -- positional indices ``0 .. len(t_s) - 1``;
            t_s_val -- number of tables having each shape of ``t_s``.
    """
    t_sh = [np.array(table).shape[:2] for table in X]
    print(f"Total shapes: {len(t_sh)}, unqiue: {len(set(t_sh))}\n")

    shape_counts = Counter(t_sh)
    t_s = list(shape_counts)
    t_s_val = list(shape_counts.values())
    t_s_i = list(range(len(t_s)))

    # Only the printed summary is ranked by frequency; the returned lists stay
    # in first-seen order.
    ranked = sorted(zip(t_s, t_s_val), key=lambda pair: pair[1], reverse=True)
    print(f"Shape distribution: {ranked}\n")
    return t_sh, t_s, t_s_i, t_s_val

def get_avg_table_sh(X):
    """Print the mean number of rows and of columns over the tables in ``X``.

    The per-shape counts returned by ``table_shape_stats`` are used as
    weights, so that function's two summary lines are printed first.
    Nothing is returned.
    """
    t_sh, t_s, _, t_s_val = table_shape_stats(X)
    n_tables = len(t_sh)
    r = sum(shape[0] * count for shape, count in zip(t_s, t_s_val)) / n_tables
    c = sum(shape[1] * count for shape, count in zip(t_s, t_s_val)) / n_tables
    print(f"shape: {r}  x  {c}")
    
def cell_stats(X):
    """Print and return cell-level statistics for an array of tables.

    ``X.tolist()`` is passed through ``flatten_1_deg`` twice to obtain the
    list of cells.
    NOTE(review): this assumes ``flatten_1_deg`` removes exactly one level of
    nesting (tables -> rows -> cells) - confirm in ``utils``.

    Returns:
        ``(all_cells, all_cells_len, c_len, c_len_val, cell_len_distr)`` where
        ``cell_len_distr`` is a list of ``(length, count)`` pairs sorted by
        length and ``c_len`` / ``c_len_val`` are its two columns as tuples.
    """
    all_cells = flatten_1_deg(flatten_1_deg(X.tolist()))
    # Cells are made hashable as tuples so that duplicates can be counted.
    distinct_cells = set(map(tuple, all_cells))
    print(f"Total cells: {len(all_cells)}, unqiue: {len(distinct_cells)}\n")

    all_cells_len = [len(cell) for cell in all_cells]
    cell_len_distr = sorted(Counter(all_cells_len).items(), key=lambda item: item[0])
    c_len, c_len_val = zip(*cell_len_distr)

    print(f"cell_len_distr: {cell_len_distr}")
    return all_cells, all_cells_len, c_len, c_len_val, cell_len_distr

# Statistics on strings containing numbers

In [6]:
# str_w_num = [word for word in all_words if contains_num(word)]
# len(str_w_num)/len(all_words),len(list(set(str_w_num)))/len(list(set(all_words))),

# Filter functions for table

In [7]:
def remove_empty_tables(tables):
    """Drop every table that contains no elements.

    A table is empty when its array form has ``size == 0`` (for example
    ``[]`` or ``[[]]``).

    The result is always built as a 1-D object array with one table per slot.
    The previous ``np.delete(tables, idx)`` call had no ``axis`` argument and
    therefore flattened its input: a list of tables that all share the same
    shape was turned into a flat array of scalars.

    Args:
        tables: sequence (list or array) of tables.

    Returns:
        1-D ``numpy.ndarray`` of dtype ``object`` holding the non-empty
        tables in their original order.
    """
    # dtype=object keeps the size check working for tables whose cells have
    # different lengths; it yields the same shape as the default conversion
    # for regular tables.
    kept = [table for table in tables
            if np.array(table, dtype=object).size != 0]

    # Fill slot by slot so NumPy never tries to merge the tables into a
    # higher-dimensional array.
    result = np.empty(len(kept), dtype=object)
    for idx, table in enumerate(kept):
        result[idx] = table
    return result

def remove_empty_cols(table):
    """Return ``table`` as nested lists without its all-empty columns.

    A column is dropped only when every one of its cells has length 0;
    partially empty columns are kept.
    """
    def is_all_empty(column):
        empty_cells = sum(1 for cell in column if len(cell) == 0)
        # Written as a ratio on purpose: a zero-length column still raises
        # ZeroDivisionError, as it always did.
        return empty_cells / len(column) == 1

    grid = np.array(table)
    surviving = [col for col in range(grid.shape[1])
                 if not is_all_empty(grid[:, col])]
    return grid[:, surviving].tolist()

def remove_empty_rows(table):
    """Return ``table`` as nested lists without its all-empty rows.

    A row is dropped only when every one of its cells has length 0.
    """
    def is_all_empty(row):
        empty_cells = sum(1 for cell in row if len(cell) == 0)
        # Written as a ratio on purpose: a zero-length row still raises
        # ZeroDivisionError, as it always did.
        return empty_cells / len(row) == 1

    grid = np.array(table)
    surviving = [r for r in range(grid.shape[0])
                 if not is_all_empty(grid[r, :])]
    return grid[surviving].tolist()

def remove_dupl_rows(table):
    """Return the rows of ``table`` with later duplicates removed.

    Rows are compared by equality and the first occurrence of each row is
    kept, so the original row order is preserved. The input is not modified.
    """
    unique_rows = []
    for candidate in table:
        if candidate in unique_rows:
            continue
        unique_rows.append(candidate)
    return unique_rows

def remove_dupl_cols(table):
    """Return ``table`` as nested lists with duplicate columns removed.

    Columns are compared by equality of their list form; the first occurrence
    of each column is kept and column order is preserved.

    The surviving columns are selected from the original array by index.
    The earlier version transposed to lists and rebuilt an array from them;
    it only handled 2-D and 3-D tables, and any other dimensionality failed
    with ``UnboundLocalError``.

    Args:
        table: table with at least a row and a column axis.

    Returns:
        The de-duplicated table as nested Python lists.

    Raises:
        ValueError: if the array form of ``table`` has fewer than 2 axes.
    """
    data = np.array(table)
    if data.ndim < 2:
        raise ValueError(
            "remove_dupl_cols expects a table with rows and columns, "
            f"got an array of shape {data.shape}")

    seen_columns = []
    keep = []
    for col_idx in range(data.shape[1]):
        column = data[:, col_idx].tolist()
        if column not in seen_columns:
            seen_columns.append(column)
            keep.append(col_idx)
    return data[:, keep].tolist()

def remove_1x1_table(X):
    """Drop every table whose first two dimensions are exactly ``(1, 1)``.

    The result is always built as a 1-D object array with one table per slot.
    The previous ``np.delete(X, idx)`` call had no ``axis`` argument and
    therefore flattened its input, which broke list inputs whose tables share
    leading dimensions.

    Args:
        X: sequence (list or array) of tables.

    Returns:
        1-D ``numpy.ndarray`` of dtype ``object`` holding the remaining
        tables in their original order.
    """
    # dtype=object keeps the shape probe working for tables with
    # variable-length cells; regular tables get the same shape as before.
    kept = [table for table in X
            if np.array(table, dtype=object).shape[:2] != (1, 1)]

    result = np.empty(len(kept), dtype=object)
    for idx, table in enumerate(kept):
        result[idx] = table
    return result

def preprocess(X):
    """Run the table filters in sequence and return the surviving tables.

    Order of operations: drop empty tables, then for each of
    ``remove_empty_cols``, ``remove_empty_rows``, ``remove_dupl_cols`` and
    ``remove_dupl_rows`` apply the step to every table and drop the tables it
    left empty. ``X.shape`` is printed after every stage (nine lines in
    total) so shrinkage can be followed.
    """
    per_table_steps = (
        remove_empty_cols,
        remove_empty_rows,
        remove_dupl_cols,
        remove_dupl_rows,
    )

    X = remove_empty_tables(X)
    print(X.shape)
    for step in per_table_steps:
        for idx in range(len(X)):
            X[idx] = step(X[idx])
        print(X.shape)
        # A step may have emptied a table completely; discard those.
        X = remove_empty_tables(X)
        print(X.shape)
    return X

# Table split functions

In [8]:
def split_data(data, max_rows=None, max_cols=None):
    """Yield ``data`` cut into blocks of at most ``max_rows`` x ``max_cols``.

    Blocks are produced row band by row band and, within a band, from left to
    right. Blocks on the bottom and right edges may be smaller than the limit.

    Args:
        data: table with at least two dimensions (rows, columns).
        max_rows: maximum rows per block; defaults to the notebook-level
            ``MAX_ROW_LEN``.
        max_cols: maximum columns per block; defaults to the notebook-level
            ``MAX_COL_LEN``.

    Yields:
        ``numpy.ndarray`` slices of the array form of ``data``.

    Raises:
        ValueError: if a block limit is smaller than 1.
    """
    # The globals are only consulted when no explicit limit is given, so the
    # function can be used before the configuration cell has run.
    max_rows = MAX_ROW_LEN if max_rows is None else max_rows
    max_cols = MAX_COL_LEN if max_cols is None else max_cols
    if max_rows < 1 or max_cols < 1:
        raise ValueError("block limits must be at least 1")

    data = np.array(data)
    n_rows, n_cols = data.shape[:2]
    for row_start in range(0, n_rows, max_rows):
        for col_start in range(0, n_cols, max_cols):
            yield data[row_start:row_start + max_rows,
                       col_start:col_start + max_cols]


def split_overflow_table(j):
    """Return table ``j`` as a list of blocks that fit the size limits.

    A table within ``MAX_ROW_LEN`` x ``MAX_COL_LEN`` is returned unchanged as
    the only element of the list. Otherwise it is cut with ``split_data`` and
    every non-empty block is returned as nested lists.
    """
    n_rows, n_cols = np.array(j).shape[:2]

    if n_cols <= MAX_COL_LEN and n_rows <= MAX_ROW_LEN:
        return [j]

    return [block.tolist() for block in split_data(j) if block.size != 0]

# Cleaning function

In [25]:
def clean(table):
    """Clean every cell of ``table`` in place and return the table.

    Each cell is a list of token strings. For every token:
      * tokens containing any ``to_rem`` marker are dropped;
      * tokens containing a ``to_rep_dash`` or ``to_rep_sp`` marker are
        rewritten (dash markers become ``_``, space markers become a space,
        then stray/duplicated underscores are collapsed);
      * all other tokens are kept unchanged.
    Kept tokens come first, rewritten tokens are appended after them,
    duplicates are removed (first occurrence wins) and the space-joined
    result is passed to ``tokenize_str``, whose return value becomes the new
    cell.

    The regular expressions are raw strings: the original non-raw literals
    relied on invalid escape sequences such as ``"\\("``, which newer Python
    versions warn about. The patterns themselves are unchanged.

    Args:
        table: list of rows, each row a list of cells (lists of ``str``).
            The rows are modified in place; the original cell lists are no
            longer mutated.

    Returns:
        The same ``table`` object with every cell replaced.
    """
    to_rem = ('=', '{', '}', '</', ':#', '\\\\', '\\', '3px')
    to_rep_dash = ('(', '),', ')', '&amp', ',_', ':_', '/_', '+_')
    to_rep_sp = ('|', '?', ':', '#', '~', '$', '^', '\\n', ';', '@')
    dash_rgx = re.compile(r"\(|\),|,_|\)|&amp|:_|/_|\+_")
    # NOTE: as before, ``\n`` here matches a newline character, while the
    # list above holds a literal backslash followed by ``n``. A token with a
    # backslash is already dropped by ``to_rem``, so that list entry cannot
    # select a token.
    sp_rgx = re.compile(r"\||\?|\:|#|~|\$|\^|\n|;|@")

    def clean_cell(cell):
        kept, rewritten = [], []
        for w in cell:
            if any(c in w for c in to_rem):
                continue
            needs_space = any(c in w for c in to_rep_sp)
            if needs_space or any(c in w for c in to_rep_dash):
                t = dash_rgx.sub("_", w)
                if needs_space:
                    t = sp_rgx.sub(" ", t)
                # Detach underscores that touch a space, then collapse runs
                # of underscores and strip leading/trailing ones.
                nw = ' '.join(filter(None, re.split(" _|_ ", t)))
                nw = '_'.join(filter(None, nw.split("_")))
                rewritten.append(nw)
            else:
                kept.append(w)
        # dict.fromkeys removes duplicates while preserving order.
        return tokenize_str(" ".join(dict.fromkeys(kept + rewritten)))

    for row in table:
        for i, cell in enumerate(row):
            row[i] = clean_cell(cell)
    return table

## String without Numbers

In [74]:
# Directory of this preprocessing variant; the artefacts loaded and saved in
# the cells below are all addressed relative to it.
PATH = './data/wo_strnum3.0_wo_ent'
# Entries of this pickle are passed to read_table one by one below.
tables_subset = loadpkl(f'{PATH}/postive_tables_set.pkl')
# 'data' field of every table returned by read_table.
read_all_tables = [read_table(js)['data'] for js in tables_subset]
# Tokenised tables; the recorded output shows a 1-D array with one entry per table.
X = loadpkl(f'{PATH}/x_tokenised.pkl')
print(X.shape)

(22932,)


# Cleaning special characters and patterns

In [75]:
%%time
# Clean all tables in parallel. The context manager tears the worker pool
# down even if `map` raises, which the explicit close()/join() pair did not.
with Pool(processes=75) as p:
    X = p.map(clean, X)
X = np.array(X)

CPU times: user 57.6 s, sys: 1min 41s, total: 2min 38s
Wall time: 21min 23s


In [76]:
X = preprocess(X)

(21518,)
(21518,)
(21518,)
(21518,)
(21518,)
(21518,)
(21518,)
(21518,)
(21518,)


In [77]:
get_avg_table_sh(X)

Total shapes: 21518, unqiue: 938

Shape distribution: [((1, 1), 1548), ((1, 3), 1289), ((2, 1), 1073), ((1, 2), 717), ((2, 3), 603), ((2, 2), 580), ((4, 1), 531), ((3, 1), 482), ((4, 2), 480), ((3, 3), 474), ((3, 2), 458), ((4, 3), 396), ((5, 1), 382), ((6, 2), 342), ((8, 2), 333), ((5, 2), 324), ((5, 3), 317), ((10, 2), 279), ((6, 3), 276), ((6, 1), 270), ((7, 2), 237), ((8, 3), 223), ((8, 1), 217), ((7, 3), 214), ((2, 4), 210), ((1, 4), 200), ((4, 4), 191), ((9, 2), 190), ((10, 3), 179), ((3, 4), 166), ((7, 1), 165), ((6, 4), 161), ((9, 3), 159), ((10, 1), 154), ((5, 4), 138), ((9, 1), 138), ((12, 3), 129), ((11, 3), 128), ((11, 2), 128), ((12, 2), 127), ((8, 4), 110), ((13, 3), 108), ((9, 4), 101), ((13, 2), 99), ((12, 1), 98), ((10, 4), 95), ((7, 4), 94), ((14, 2), 93), ((15, 2), 87), ((12, 4), 86), ((16, 3), 82), ((15, 3), 75), ((14, 3), 74), ((11, 1), 72), ((2, 5), 71), ((11, 4), 71), ((16, 2), 69), ((14, 4), 65), ((22, 3), 65), ((14, 1), 64), ((6, 5), 64), ((15, 4), 64), ((13, 4

In [78]:
# Block size limits: split_data / split_overflow_table cut larger tables into
# blocks of at most this many rows and columns, and pad_table pads every block
# up to exactly this size. The values also appear in the vocabulary file name.
MAX_ROW_LEN = 11
MAX_COL_LEN = 3

# Rejoining and splitting for entity check

In [79]:
# def retokenize2merge_ent(table):
#     for row in table:
#         for i, cell in enumerate(row):
#             if len(cell)>1:
#                 row[i] = tokenize_str(" ".join(list(dict.fromkeys(cell))))
#     return table

In [80]:
# %%time
# p = Pool(processes=50)
# X = p.map(retokenize2merge_ent, X)
# p.close()
# p.join()
# X = np.array(X)

# X = preprocess(X)

## Splitting into smaller blocks

In [81]:
print(X.shape)
# Cut every oversized table into blocks (one list of blocks per table), then
# flatten those lists so that each block becomes a table of its own.
X = [split_overflow_table(table) for table in X.tolist()]
X = flatten_1_deg(X)
X = np.array(X)
print(X.shape)

# Run the filters again on the blocks produced by the split.
X = preprocess(X)

(21518,)
(48973,)
(47862,)
(47862,)
(47862,)
(47862,)
(47862,)
(47862,)
(47862,)
(47862,)
(47862,)


In [106]:
get_avg_table_sh(X)

Total shapes: 47862, unqiue: 33

Shape distribution: [((11, 3), 7744), ((11, 2), 4901), ((1, 1), 3590), ((11, 1), 2409), ((1, 3), 2269), ((2, 1), 2237), ((2, 3), 1526), ((3, 3), 1428), ((4, 3), 1309), ((2, 2), 1292), ((3, 1), 1280), ((1, 2), 1261), ((4, 1), 1167), ((5, 3), 1115), ((6, 3), 1105), ((3, 2), 972), ((4, 2), 963), ((10, 3), 960), ((5, 1), 921), ((6, 2), 872), ((7, 3), 847), ((8, 3), 821), ((9, 3), 807), ((5, 2), 751), ((10, 2), 750), ((6, 1), 692), ((8, 2), 652), ((7, 2), 584), ((9, 2), 565), ((8, 1), 532), ((10, 1), 530), ((9, 1), 516), ((7, 1), 494)]

shape: 6.4033262295767  x  2.1162299945677154


In [83]:
# print(X.shape)
# X = remove_1x1_table(X)
# print(X.shape)

In [107]:
# Print every table that consists of a single cell.
# NOTE(review): the shape distribution above reports several thousand (1, 1)
# tables, so this prints one line for each of them - consider printing a
# sample instead.
for i in X:
    if np.array(i).shape[:2]==(1,1):
        print(i)

[[['dance_club_songs']]]
[[['the_murray_hill_theatre_jacksonville_florida']]]
[[['allmusic']]]
[[['total']]]
[[['[bicl']]]
[[['nominated']]]
[[['correct_input']]]
[[['unknown']]]
[[['total']]]
[[['2014_cricket_world_cup_qualifier']]]
[[['one_zone']]]
[[['attention_attention_deficit_slow_sluggish_lethargic_feeling']]]
[[['retirement']]]
[[['retirement']]]
[[['retirement']]]
[[['retirement']]]
[[['retirement']]]
[[['retirement']]]
[[['resignation']]]
[[['partial']]]
[[['reporting']]]
[[['unknown']]]
[[['latin_pop_albums']]]
[[['uk_singles_chart']]]
[[['george_susce_catcher']]]
[[['dismantled']]]
[[['bombed']]]
[[['south_korea_national_football_team']]]
[[['second_city_derby']]]
[[['north_london_derby']]]
[[['steel_city_derby']]]
[[['isbn']]]
[[['spanish_language']]]
[[['noaa']]]
[[['denmark']]]
[[['a.c._milan']]]
[[['retrieved']]]
[[['listed']]]
[[['30,000_maximum']]]
[[['canterbury_museum']]]
[[['corpus_christi_texas']]]
[[['london_ontario']]]
[[['fomalhaut']]]
[[['penalty_shoot-out_ass

[[['subscription']]]
[[['allmusic']]]
[[['elected']]]
[[['uric']]]
[[['allmusic']]]
[[['u.s._billboard']]]
[[['burlesque']]]
[[['webisodes_documentary']]]
[[['latroy_hawkins']]]
[[['nominated']]]
[[['nominated']]]
[[['nominated']]]
[[['player-coach']]]
[[['treasure_buddies']]]
[[['allmusic']]]
[[['u.s._billboard']]]
[[['million_pounds']]]
[[['cancelled']]]
[[['unknown']]]
[[['2011_imola_gp2_asia_series_round']]]
[[['pw545b']]]
[[['sepnov_jan_maraug']]]
[[['1936_1936_dot']]]
[[['recap']]]
[[['georgina_jackson']]]
[[['sister']]]
[[['12_bonus_track']]]
[[['stade_dudelange']]]
[[['allmusic']]]
[[['allmusic']]]
[[['public_vote']]]
[[['guangzhou_gac']]]
[[['kenan_an_english-born_israeli-american_director']]]
[[['allmusic']]]
[[['allmusic']]]
[[["united_states_men's_national_basketball_team"]]]
[[['numbering']]]
[[['total']]]
[[['turkey_fed_cup_team']]]
[[['the_topix']]]
[[['olympiacos_f.c']]]
[[['slovakia']]]
[[['federazione_industria_musicale_italiana']]]
[[['southern_terminus']]]
[[['dave_

[[['website']]]
[[['source_weatherbase']]]
[[['southeast_division']]]
[[['southeast_division']]]
[[['rpm_magazine']]]
[[['no_time']]]
[[['no_time']]]
[[['107%_time']]]
[[['indonesia']]]
[[['philippines']]]
[[['allmusic']]]
[[['allmusic']]]
[[['pain']]]
[[['association_of_hungarian_record_companies']]]
[[['allmusic']]]
[[['national_population_and_family_planning_commission']]]
[[['national_championship_conference_title_conference_division_title']]]
[[['national_college_baseball_hall_of_fame']]]
[[['2013_central_american_and_caribbean_championships_in_athletics__results']]]
[[['demolition/redevelopment']]]
[[['demolished']]]
[[['building_homepage']]]
[[['completed']]]
[[['regular_season_champion_conference_tournament_champion_conference_regular_season_and_conference_tournament_champion_conference_division_champion']]]
[[['regular_season_champion_conference_tournament_champion_conference_regular_season_and_conference_tournament_champion_conference_division_champion']]]
[[['live']]]
[[['to

[[['oliver_kahn']]]
[[['1976_spanish_motorcycle_grand_prix']]]
[[['bishop_of_buckingham']]]
[[['report']]]
[[['debut_movie']]]
[[['public_policy_polling']]]
[[['pacific_swallow']]]
[[['paul_ayme']]]
[[['damiris_dantas']]]
[[['the_entire_aegean_sea']]]
[[['official_results_le_directeur_general_des_elections_du_quebec']]]
[[['turnout_approx']]]
[[['ldu_quito']]]
[[['nominated']]]
[[['nominated']]]
[[['nominated']]]
[[['nominated']]]
[[['nominated']]]
[[['nominated']]]
[[['planet_sound']]]
[[['royal_blue_accents']]]
[[['everton_f.c']]]
[[['french_top_albums']]]
[[['the_numbers_browser_market_share']]]
[[['cemetery_end']]]
[[['listed']]]
[[['erected']]]
[[['2011_fia_gt1_san_luis_round']]]
[[['chennai_edition']]]
[[['chennai_edition']]]
[[['retrieved']]]
[[['allmusic']]]
[[['stefano_intini']]]
[[['brendon_enzo_hayden']]]
[[['leovegildo_lins_da_gama_junior']]]
[[['noaa']]]
[[['allmusic']]]
[[['aric_almirola']]]
[[['nominated']]]
[[["south_korea_men's_national_ice_hockey_team"]]]
[[['first_me

In [85]:
def shrink_cell_len(table):
    """Reduce every multi-token cell of ``table`` to its last token.

    Cells holding zero or one token are left as they are. The rows are
    modified in place and the same ``table`` object is returned.
    """
    for row in table:
        for col in range(len(row)):
            tokens = row[col]
            if len(tokens) <= 1:
                continue
            row[col] = [tokens[-1]]
    return table

In [86]:
# Reduce every cell of every table to at most one token (see shrink_cell_len);
# X is updated slot by slot.
for i,table in enumerate(X):
    X[i] = shrink_cell_len(table)

In [93]:
savepkl(f'{PATH}/x_tokenised_preprocessed.pkl',X)

In [100]:
X = loadpkl(f'{PATH}/x_tokenised_preprocessed.pkl')

# Vocab

In [101]:
all_cells, all_cells_len, c_len, c_len_val, cell_len_distr = cell_stats(X)

Total cells: 685056, unqiue: 295986

cell_len_distr: [(0, 73214), (1, 611842)]


In [102]:
# Mean token count over the non-empty cells. This relies on cell_len_distr
# being sorted by length with the zero-length bucket first: [1:] skips that
# bucket and [0][1] is its count. If there were no empty cells the first
# bucket would be wrongly excluded.
sum([a*b for a,b in cell_len_distr[1:]])/(len(all_cells)-cell_len_distr[0][1])

1.0

In [105]:
# Fraction of all cells (empty ones included) that hold exactly one token.
s = 0
for l,v in cell_len_distr:
    if l==1:
        s+=v
s/len(all_cells)

0.8931269852391629

In [104]:
all_words = flatten_1_deg(all_cells)
len(all_words),len(list(set(all_words)))

(611842, 295985)

In [92]:
# Ad-hoc vocabulary search: number and print every word that contains any of
# the substrings listed in l_c.
s = 0
# Earlier probes, kept for reference:
# l_c = ['|','?',':','#','~','$','^','\\n',';','@'] # replace with space
# l_c = ['=','{','}','</span', ':#','\\\\','\\'] # remove altogether
# l_c = ['(',')','&amp',',_','/_','+_'] # replace with _
# l_c = ['/'] # not decided yet how to handle this one
l_c = ['low_risk']
for i in all_words:
    if any(c in i for c in l_c):
        s+=1
        print(s,i)

1 760_low_risk


In [94]:
baseline_f = pd.read_csv('../global_data/features.csv')

def generate_vocab(X):
    """Build the vocabulary from the table tokens and baseline query tokens.

    Side effects:
      * prints the token count and distinct-token count of the tables, and
        the final vocabulary size;
      * writes the word-frequency table, sorted by descending frequency, to
        ``{PATH}/word_distr.csv``;
      * pickles the vocabulary to
        ``{PATH}/vocab_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl``.

    The vocabulary is ``['<UNK>', '<PAD>']`` followed by the distinct words
    in sorted order. It used to be ``list(set(...))``, whose order changes
    with string hash randomisation, so regenerating the file could silently
    reassign every token id.
    NOTE(review): sorting assumes every token is a ``str`` - confirm against
    ``tokenize_str``.

    Args:
        X: array of tokenised tables. It is flattened three times with
            ``flatten_1_deg``, presumably tables -> rows -> cells -> tokens.

    Reads the notebook-level ``baseline_f`` frame (column ``'query'``),
    ``PATH``, ``MAX_COL_LEN`` and ``MAX_ROW_LEN``. Returns nothing.
    """
    result = flatten_1_deg(flatten_1_deg(flatten_1_deg(X.tolist())))
    print(f"table only vocab: {len(result)}, {len(set(result))}")

    # Query words are added so that query-only words also get an id.
    query_l = [tokenize_str(query.lower())
               for query in baseline_f['query'].unique()]
    result += flatten_1_deg(query_l)

    count = Counter(result)
    df = pd.DataFrame([[word, freq] for word, freq in count.items()])
    df = df.sort_values(by=[1], ascending=False)
    df.to_csv(f'{PATH}/word_distr.csv', index=False, columns=None)

    # Fixed slots: index 0 is <UNK>, index 1 is <PAD>, as before.
    vocab = ['<UNK>', '<PAD>'] + sorted(count)
    print(f'total vocab: {len(vocab)}\n')
    savepkl(
        f'{PATH}/vocab_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', vocab)

In [95]:
generate_vocab(X)

table only vocab: 611842, 295985
total vocab: 296037



# Padding

In [53]:
def pad_table(table, val='<PAD>', max_rows=None, max_cols=None):
    """Centre ``table`` inside a ``max_rows`` x ``max_cols`` grid of padding.

    Every grid position outside the table holds ``[val]``; empty cells of the
    table are replaced by ``[val]`` as well. When the free space is odd, the
    extra padding row/column goes to the bottom/right.

    Changes from the earlier version:
      * the input table is no longer modified (empty cells used to get
        ``val`` appended in place);
      * the table size is taken from the lists themselves rather than from
        ``np.array(table)``, which cannot convert tables mixing empty and
        non-empty cells on recent NumPy versions;
      * a table larger than the target grid raises instead of producing a
        malformed grid;
      * an empty table now yields a grid made of padding only;
      * the target size can be passed explicitly.

    Args:
        table: list of rows, each a list of cells (lists of tokens). All rows
            must have the same number of cells.
        val: padding token.
        max_rows: grid height; defaults to the notebook-level ``MAX_ROW_LEN``.
        max_cols: grid width; defaults to the notebook-level ``MAX_COL_LEN``.

    Returns:
        A new ``max_rows`` x ``max_cols`` nested list of cells.

    Raises:
        ValueError: if the rows differ in length or the table does not fit.
    """
    max_rows = MAX_ROW_LEN if max_rows is None else max_rows
    max_cols = MAX_COL_LEN if max_cols is None else max_cols

    rows = len(table)
    cols = len(table[0]) if rows else 0
    if any(len(row) != cols for row in table):
        raise ValueError("pad_table expects every row to have the same number of cells")
    if rows > max_rows or cols > max_cols:
        raise ValueError(
            f"table of shape ({rows}, {cols}) does not fit into "
            f"({max_rows}, {max_cols})")

    top = (max_rows - rows) // 2
    left = (max_cols - cols) // 2

    full_t = [[[val] for _ in range(max_cols)] for _ in range(max_rows)]
    for offset, row in enumerate(table):
        # Copy the cells so the result never aliases the caller's lists.
        full_t[top + offset][left:left + cols] = [
            list(cell) if len(cell) else [val] for cell in row
        ]
    return full_t


In [54]:
X[101]

[[['united_states']], [['denmark']], [['france']], [['sweden']], [['finland']]]

In [55]:
pad_table(X[101])

[[['<PAD>'], ['<PAD>'], ['<PAD>']],
 [['<PAD>'], ['<PAD>'], ['<PAD>']],
 [['<PAD>'], ['<PAD>'], ['<PAD>']],
 [['<PAD>'], ['united_states'], ['<PAD>']],
 [['<PAD>'], ['denmark'], ['<PAD>']],
 [['<PAD>'], ['france'], ['<PAD>']],
 [['<PAD>'], ['sweden'], ['<PAD>']],
 [['<PAD>'], ['finland'], ['<PAD>']],
 [['<PAD>'], ['<PAD>'], ['<PAD>']],
 [['<PAD>'], ['<PAD>'], ['<PAD>']],
 [['<PAD>'], ['<PAD>'], ['<PAD>']]]

In [56]:
%%time
# Pad all tables in parallel. The context manager tears the worker pool down
# even if `map` raises, which the explicit close()/join() pair did not.
with Pool(processes=10) as p:
    X_pad = p.map(pad_table, X)
X_pad = np.array(X_pad)
print(X_pad.shape)

(47862, 11, 3, 1)
CPU times: user 3.72 s, sys: 1.45 s, total: 5.17 s
Wall time: 5.28 s


In [57]:
savepkl(f'{PATH}/x_tokenised_preprocessed_pad.pkl', X_pad)

# Baseline conversion

In [96]:
baseline_f = pd.read_csv('../global_data/features.csv')

In [97]:
def t(baseline_f):
    """Add tokenised table and query columns to a baseline feature frame.

    ``table_tkn`` is derived from ``table_id`` by reading the table, taking
    its ``'data'`` field and applying ``tokenize_table``, ``clean`` and
    ``shrink_cell_len`` in that order; ``query_tkn`` is the lower-cased
    ``query`` passed to ``tokenize_str``. The frame is modified in place and
    returned.
    """
    def tokenise_table(table_id):
        raw_table = read_table(table_id)['data']
        return shrink_cell_len(clean(tokenize_table(raw_table)))

    def tokenise_query(query):
        return tokenize_str(query.lower())

    baseline_f['table_tkn'] = baseline_f.table_id.apply(tokenise_table)
    baseline_f['query_tkn'] = baseline_f['query'].apply(tokenise_query)
    return baseline_f

In [98]:
%%time
baseline_f_ = mp(baseline_f, t, 70)

CPU times: user 22.3 s, sys: 45.7 s, total: 1min 7s
Wall time: 10min 3s


In [73]:
baseline_f_.to_csv(f'{PATH}/baseline_f_tq-tkn.csv', index=False)

In [99]:
baseline_f_['table_tkn'][0]

[[['760_low_risk'], []],
 [[], []],
 [[], []],
 [[], []],
 [[], []],
 [[], []],
 [['540_high_risk'], []],
 [['no_credit_rating'], ['the_lender']]]

In [None]:
remove_empty_cols
remove_empty_rows
remove_dupl_cols
remove_dupl_rows

# Testing

In [None]:
# ALL_TABLES_PATH_ORG is not referenced by any cell visible in this notebook.
ALL_TABLES_PATH_ORG = '../global_data/tables_redi2_1/'
OUTPUT_DIR = '../global_data/all_tables'
# Names of all entries found in OUTPUT_DIR.
all_tables = os.listdir(OUTPUT_DIR)

In [None]:
# Build the table subset: every table id listed in the baseline feature file
# plus 20000 randomly drawn entries of all_tables, with duplicates removed.
# NOTE(review): random.sample is not seeded and the set() round trip does not
# guarantee an order, so neither the subset nor its order is reproducible.
baseline_f = pd.read_csv('../global_data/features.csv')
tables_subset_3k = list(baseline_f['table_id'])
tables_subset = list(set(tables_subset_3k+random.sample(all_tables, 20000)))

In [None]:
# Read the 'data' field of every selected table, then drop the empty tables;
# the count is printed before and after the filter.
read_all_tables = [read_table(js)['data'] for js in tables_subset]
print(len(read_all_tables))
read_all_tables = remove_empty_tables(read_all_tables)
print(len(read_all_tables))

In [None]:
all_cells = flatten_1_deg(flatten_1_deg(read_all_tables))
all_unq_cells = list(set(all_cells))

In [None]:
# NOTE: despite its name this dict maps integer index -> cell (keys are the
# indices, values are the unique cells); f() below does the reverse lookup.
cell2i = dict(zip(range(len(all_unq_cells)),all_unq_cells))

In [None]:
def f(cell):
    """Return the key of ``cell2i`` whose value equals ``cell``.

    Both the key list and the value list are rebuilt on every call and the
    values are scanned linearly; ``list.index`` raises ``ValueError`` when
    ``cell`` is not among the values.
    NOTE(review): a cell -> index dict built once would make this a constant
    time lookup, but ``cell2i`` would have to change with it.
    """
    return list(cell2i.keys())[list(cell2i.values()).index(cell)]
# Element-wise version of f, applied to whole arrays by tables2i.
nf = np.vectorize(f)
def tables2i(table):
    """Map every cell of ``table`` to its index and return nested lists.

    The table is converted to an array and the vectorised lookup ``nf`` is
    applied to each element.
    """
    return nf(np.array(table)).tolist()

In [None]:
%%time
# Convert all tables in parallel. The context manager tears the worker pool
# down even if `map` raises, which the explicit close()/join() pair did not.
with Pool(processes=70) as p:
    X = p.map(tables2i, read_all_tables)