In [None]:
import pandas as pd
import tqdm
import sienna

In [2]:
import sys
# setting path
sys.path.append('..')
from utils import clean_text, save_pickle_file

In [None]:
def eval_df(df):
    """Parse the stringified ``labels``, ``table`` and ``headers`` columns.

    Each of the three columns is replaced, in place, by the result of
    calling ``eval`` on every cell, and the same DataFrame is returned.

    NOTE(review): ``eval`` executes arbitrary Python found in the cells.
    That is only acceptable for trusted files; if the cells are plain
    literals, ``ast.literal_eval`` would be the safer parser -- confirm
    against the data before switching.
    """
    for column in ("labels", "table", "headers"):
        df[column] = df[column].apply(lambda x: eval(x))
    return df

In [None]:
# Read each ground-truth CSV and immediately parse its stringified
# "labels", "table" and "headers" columns with eval_df.
cta_train_gt = eval_df(pd.read_csv('data/sets/train_wikitables_subset_20.csv'))
cta_dev_gt = eval_df(pd.read_csv('data/sets/dev_wikitables_subset_2.csv'))
cta_test_gt = eval_df(pd.read_csv('data/sets/test_wikitables_subset_2.csv'))

In [None]:
# Lookup indexed by column label; in this notebook it is referenced only by
# the commented-out "-kg" (knowledge generation) variant of the training loop.
labels_to_text = sienna.load("data/labels_to_text_wikitables-2-cta.json")

In [None]:
# Build the training split: one record per table, holding the table rendered
# as a fixed-size markdown grid together with its labels and metadata.
ROWS_PER_TABLE = 5        # every serialized table has exactly this many rows
MAX_WORDS_PER_CELL = 20   # cell text is cut to this many whitespace-separated tokens

train = []
for _, row in tqdm.tqdm(cta_train_gt.iterrows(), total=len(cta_train_gt)):

    table = row["table"]            # table values, iterated as a list of columns
    ordered_labels = row["labels"]  # column labels

    cleaned_columns = []
    for column in table:
        # cell[1][1] is the value passed to clean_text (presumably the cell
        # text -- confirm against the dataset schema).
        cleaned_cells = (
            " ".join(clean_text(cell[1][1]).split()[:MAX_WORDS_PER_CELL])
            for cell in column
        )
        non_empty = [text for text in cleaned_cells if text != ""]
        # Keep the first ROWS_PER_TABLE non-empty cells, right-padding short
        # columns with "" so that every column has the same length.
        cleaned_columns.append((non_empty + [""] * ROWS_PER_TABLE)[:ROWS_PER_TABLE])

    # Transpose columns -> rows; always ROWS_PER_TABLE rows, even with no columns.
    table_list_df = [
        [column[row_index] for column in cleaned_columns]
        for row_index in range(ROWS_PER_TABLE)
    ]

    # Generic headers; these are stored in the record regardless of which
    # header variant is used for the markdown rendering below.
    column_names = [f"Column {i+1}" for i in range(len(cleaned_columns))]

    # Markdown format
    df_new = pd.DataFrame(table_list_df, columns=column_names)
    # uncomment for knowledge generation set (-kg):
    # df_new = pd.DataFrame(table_list_df, columns=[", ".join([labels_to_text[l] for l in ordered_labels[i]])  for i in range(len(cleaned_columns))])

    # NOTE(review): every cell in df_new is a str, so select_dtypes finds no
    # numeric column and each entry below is "textual". Detecting numbers
    # would need an explicit parse (e.g. pd.to_numeric); left unchanged so the
    # saved dataset stays identical -- confirm the intended behaviour.
    numeric_columns = df_new.select_dtypes(include=['number']).columns
    column_types = [
        "numerical" if name in numeric_columns else "textual"
        for name in df_new.columns
    ]

    table_string = df_new.to_markdown(index=False)

    # NOTE(review): eval() executes whatever the "domains" cell contains; only
    # safe for trusted CSVs (ast.literal_eval is the safer parser for literals).
    train.append([row["table_name"], table_string, ordered_labels, eval(row["domains"]), column_types, row["page_title"], row["section_title"], row["headers"], column_names ])

100%|██████████| 809/809 [00:00<00:00, 1116.57it/s]


In [None]:
# Build the validation split: one record per table, holding the table rendered
# as a fixed-size markdown grid together with its labels and metadata.
ROWS_PER_TABLE = 5        # every serialized table has exactly this many rows
MAX_WORDS_PER_CELL = 20   # cell text is cut to this many whitespace-separated tokens

dev = []
for _, row in tqdm.tqdm(cta_dev_gt.iterrows(), total=len(cta_dev_gt)):

    table = row["table"]            # table values, iterated as a list of columns
    ordered_labels = row["labels"]  # column labels

    cleaned_columns = []
    for column in table:
        # cell[1][1] is the value passed to clean_text (presumably the cell
        # text -- confirm against the dataset schema).
        cleaned_cells = (
            " ".join(clean_text(cell[1][1]).split()[:MAX_WORDS_PER_CELL])
            for cell in column
        )
        non_empty = [text for text in cleaned_cells if text != ""]
        # Keep the first ROWS_PER_TABLE non-empty cells, right-padding short
        # columns with "" so that every column has the same length.
        cleaned_columns.append((non_empty + [""] * ROWS_PER_TABLE)[:ROWS_PER_TABLE])

    # Transpose columns -> rows; always ROWS_PER_TABLE rows, even with no columns.
    table_list_df = [
        [column[row_index] for column in cleaned_columns]
        for row_index in range(ROWS_PER_TABLE)
    ]

    # Generic headers, used for the markdown rendering and stored in the record.
    column_names = [f"Column {i+1}" for i in range(len(cleaned_columns))]

    # Markdown format
    df_new = pd.DataFrame(table_list_df, columns=column_names)

    # NOTE(review): every cell in df_new is a str, so select_dtypes finds no
    # numeric column and each entry below is "textual". Detecting numbers
    # would need an explicit parse (e.g. pd.to_numeric); left unchanged so the
    # saved dataset stays identical -- confirm the intended behaviour.
    numeric_columns = df_new.select_dtypes(include=['number']).columns
    column_types = [
        "numerical" if name in numeric_columns else "textual"
        for name in df_new.columns
    ]

    table_string = df_new.to_markdown(index=False)

    # NOTE(review): eval() executes whatever the "domains" cell contains; only
    # safe for trusted CSVs (ast.literal_eval is the safer parser for literals).
    dev.append([row["table_name"], table_string, ordered_labels, eval(row["domains"]), column_types, row["page_title"], row["section_title"], row["headers"], column_names])

100%|██████████| 416/416 [00:00<00:00, 1001.55it/s]


In [None]:
# Build the test split: one record per table, holding the table rendered
# as a fixed-size markdown grid together with its labels and metadata.
ROWS_PER_TABLE = 5        # every serialized table has exactly this many rows
MAX_WORDS_PER_CELL = 20   # cell text is cut to this many whitespace-separated tokens

test = []
for _, row in tqdm.tqdm(cta_test_gt.iterrows(), total=len(cta_test_gt)):

    table = row["table"]            # table values, iterated as a list of columns
    ordered_labels = row["labels"]  # column labels

    cleaned_columns = []
    for column in table:
        # cell[1][1] is the value passed to clean_text (presumably the cell
        # text -- confirm against the dataset schema).
        cleaned_cells = (
            " ".join(clean_text(cell[1][1]).split()[:MAX_WORDS_PER_CELL])
            for cell in column
        )
        non_empty = [text for text in cleaned_cells if text != ""]
        # Keep the first ROWS_PER_TABLE non-empty cells, right-padding short
        # columns with "" so that every column has the same length.
        cleaned_columns.append((non_empty + [""] * ROWS_PER_TABLE)[:ROWS_PER_TABLE])

    # Transpose columns -> rows; always ROWS_PER_TABLE rows, even with no columns.
    table_list_df = [
        [column[row_index] for column in cleaned_columns]
        for row_index in range(ROWS_PER_TABLE)
    ]

    # Generic headers, used for the markdown rendering and stored in the record.
    column_names = [f"Column {i+1}" for i in range(len(cleaned_columns))]

    # Markdown format
    df_new = pd.DataFrame(table_list_df, columns=column_names)

    # NOTE(review): every cell in df_new is a str, so select_dtypes finds no
    # numeric column and each entry below is "textual". Detecting numbers
    # would need an explicit parse (e.g. pd.to_numeric); left unchanged so the
    # saved dataset stays identical -- confirm the intended behaviour.
    numeric_columns = df_new.select_dtypes(include=['number']).columns
    column_types = [
        "numerical" if name in numeric_columns else "textual"
        for name in df_new.columns
    ]

    table_string = df_new.to_markdown(index=False)

    # NOTE(review): eval() executes whatever the "domains" cell contains; only
    # safe for trusted CSVs (ast.literal_eval is the safer parser for literals).
    test.append([row["table_name"], table_string, ordered_labels, eval(row["domains"]), column_types, row["page_title"], row["section_title"], row["headers"], column_names])

100%|██████████| 379/379 [00:00<00:00, 707.10it/s]


In [None]:
# Write each split to its own pickle file, in train / val / test order.
# For the knowledge generation (-kg) variant, save the training split with:
# save_pickle_file('data/wikitables-2-cta-train-random-20-kg.pkl',train)
for pickle_path, split_records in (
    ('data/wikitables-2-cta-train-random-20.pkl', train),
    ('data/wikitables-2-cta-val.pkl', dev),
    ('data/wikitables-2-cta-test.pkl', test),
):
    save_pickle_file(pickle_path, split_records)