In [1]:
import json

import pandas as pd
import tqdm
# import sienna

In [None]:
import sys
# setting path
sys.path.append('..')
from utils import clean_text, save_pickle_file

In [None]:
# Root directory of the SOTAB V2 CTA tables.
# The tables can be downloaded from: https://webdatacommons.org/structureddata/sotab/v2/
# Keep the trailing slash: get_table() builds each file path by appending
# "Train/", "Validation/" or "Test/" plus the file name directly to this string.
table_path = "../../../SOTAB-v2/CTA/"

In [None]:
# Ground-truth annotation subsets, one CSV per split. Later cells read the
# columns "table_name", "column_index", "label" and "column_type" from them.
# NOTE(review): the train file uses the "sotab_v2" prefix while validation and
# test use "sotabv2" -- confirm against the files actually present on disk.
cta_train_gt, cta_val_gt, cta_test_gt = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/sets/sotab_v2_cta_train-subset.csv",
        "data/sets/sotabv2_cta_validation-subset.csv",
        "data/sets/sotabv2_cta_test-subset.csv",
    )
]

In [None]:
def _index_ground_truth(gt_frame):
    """Nest one ground-truth frame as {table_name: {column_index: [label, column_type]}}.

    Tables appear in the order in which they are first seen in ``gt_frame``.
    If a (table_name, column_index) pair occurs more than once, the last row wins.
    """
    per_table = {}
    # Iterate the four columns in lockstep instead of DataFrame.iterrows(), which
    # builds a Series per row; .tolist() yields plain Python scalars.
    for table_name, column_index, label, column_type in zip(
        gt_frame["table_name"].tolist(),
        gt_frame["column_index"].tolist(),
        gt_frame["label"].tolist(),
        gt_frame["column_type"].tolist(),
    ):
        per_table.setdefault(table_name, {})[column_index] = [label, column_type]
    return per_table


# Ground truth per split: gt[split][table_name][column_index] -> [label, column_type]
gt = {
    'train': _index_ground_truth(cta_train_gt),
    'val': _index_ground_truth(cta_val_gt),
    'test': _index_ground_truth(cta_test_gt),
}

In [None]:
# Label lookup table, read from JSON. In this notebook it is only referenced by
# the commented-out "-kg" header line inside get_table().
# Loaded with the standard-library json module because the `sienna` import at the
# top of the notebook is commented out, so `sienna.load` fails on a fresh kernel.
with open("data/labels_to_text_sotabv2-subsetu-cta.json", encoding="utf-8") as labels_file:
    labels_to_text = json.load(labels_file)

In [None]:
# Markdown Format
def get_table(file_name, n_rows=5, max_words=20, include_context=True):
    """Render one SOTAB table as a small markdown table plus its CTA annotations.

    Every kept column is reduced to its first ``n_rows`` non-empty cleaned cells,
    each truncated to ``max_words`` whitespace-separated words, and padded with
    empty strings up to ``n_rows``. Empty cells are dropped independently per
    column, so a markdown row does not necessarily correspond to one row of the
    source table.

    Relies on the notebook globals ``gt`` and ``table_path`` and on ``clean_text``
    imported from ``utils`` (its exact cleaning rules are not shown here).

    Args:
        file_name: Name of the gzip-compressed JSON-lines table file. It is looked
            up in ``gt['train']``, then ``gt['val']``; anything else is treated as
            a test table.
        n_rows: Number of rows in the rendered table (default 5).
        max_words: Maximum number of words kept per cell (default 20).
        include_context: If True (default), columns without a ground-truth entry
            are kept as context with an empty label and type. If False, only
            annotated columns are rendered.

    Returns:
        A tuple ``(table_string, labels, types, domain, column_names)`` where
        ``table_string`` is the markdown table, ``labels`` and ``types`` hold one
        entry per rendered column ("" for context columns), ``domain`` is the part
        of ``file_name`` before the first underscore, and ``column_names`` are the
        generic headers "Column 1", "Column 2", ...

    Raises:
        KeyError: If ``file_name`` is not annotated in the split it resolves to.
    """
    # The keys of gt[split] are exactly the table names of that split, so a dict
    # membership test replaces rebuilding a list from the CSV column on every call.
    if file_name in gt['train']:
        split, folder = 'train', 'Train'
    elif file_name in gt['val']:
        split, folder = 'val', 'Validation'
    else:
        split, folder = 'test', 'Test'
    path = f'{table_path}{folder}/{file_name}'

    df = pd.read_json(path, compression='gzip', lines=True)
    annotations = gt[split][file_name]

    # Positional indices of the columns to render; ground truth is keyed by position.
    kept_positions = [
        i for i in range(len(df.columns))
        if include_context or i in annotations
    ]

    ordered_labels = []
    ordered_types = []
    for i in kept_positions:
        # Unlabeled (context) columns carry an empty label and type.
        label, column_type = annotations.get(i, ("", ""))
        ordered_labels.append(label)
        ordered_types.append(column_type)

    cleaned_columns = []
    for i in kept_positions:
        cells = []
        for value in df.iloc[:, i].tolist():
            # Only the first n_rows non-empty cells are used, so stop cleaning early.
            if len(cells) >= n_rows:
                break
            cleaned = " ".join(clean_text(value).split()[:max_words])
            if cleaned != "":
                cells.append(cleaned)
        # Pad short columns so every column has exactly n_rows cells.
        cells.extend([""] * (n_rows - len(cells)))
        cleaned_columns.append(cells)

    # Transpose the column-major cells into row-major records.
    table_rows = [[column[j] for column in cleaned_columns] for j in range(n_rows)]

    # Markdown format
    column_names = [f"Column {i+1}" for i in range(len(cleaned_columns))]
    df_new = pd.DataFrame(table_rows, columns=column_names)
    # uncomment for knowledge generation set (-kg):
    # df_new = pd.DataFrame(table_rows, columns=[labels_to_text[label] for label in ordered_labels])

    table_string = df_new.to_markdown(index=False)

    return table_string, ordered_labels, ordered_types, file_name.split("_")[0], column_names


In [None]:
# Serialize every annotated training table. Each record is stored as
# [table name, markdown table, labels, domain, types, column names]; note that
# the domain comes before the types, unlike in get_table()'s return order.
train_examples = []
train_progress = tqdm.tqdm(gt['train'], total=len(gt['train']))
for table_name in train_progress:
    table_md, col_labels, col_types, domain, headers = get_table(table_name)
    train_examples.append([table_name, table_md, col_labels, domain, col_types, headers])

  0%|          | 0/698 [00:00<?, ?it/s]

100%|██████████| 698/698 [00:33<00:00, 20.90it/s]


In [None]:
# Serialize every annotated validation table. Each record is stored as
# [table name, markdown table, labels, domain, types, column names]; note that
# the domain comes before the types, unlike in get_table()'s return order.
val_examples = []
val_progress = tqdm.tqdm(gt['val'], total=len(gt['val']))
for table_name in val_progress:
    table_md, col_labels, col_types, domain, headers = get_table(table_name)
    val_examples.append([table_name, table_md, col_labels, domain, col_types, headers])

100%|██████████| 257/257 [00:11<00:00, 22.98it/s]


In [None]:
# Serialize every annotated test table. Each record is stored as
# [table name, markdown table, labels, domain, types, column names]; note that
# the domain comes before the types, unlike in get_table()'s return order.
test_examples = []
test_progress = tqdm.tqdm(gt['test'], total=len(gt['test']))
for table_name in test_progress:
    table_md, col_labels, col_types, domain, headers = get_table(table_name)
    test_examples.append([table_name, table_md, col_labels, domain, col_types, headers])

100%|██████████| 367/367 [00:16<00:00, 22.06it/s]


In [None]:
# Persist the serialized examples of each split, in train / val / test order.
for pickle_path, split_examples in (
    ("data/sotabv2-subsetu-cta-train-random-20.pkl", train_examples),
    ("data/sotabv2-subsetu-cta-val.pkl", val_examples),
    ("data/sotabv2-subsetu-cta-test.pkl", test_examples),
):
    save_pickle_file(pickle_path, split_examples)

In [None]:
# Save knowledge generation set
# NOTE: this writes the same `train_examples` list as the regular train pickle
# unless the "-kg" DataFrame line inside get_table() was uncommented and the
# train examples were rebuilt before running this cell.
save_pickle_file("data/sotabv2-subsetu-cta-train-random-20-kg.pkl", train_examples)