In [None]:
import pandas as pd
import sys
# setting path
sys.path.append('..')
from utils import save_pickle_file, clean_text, load_txt_file


In [None]:
# Load the ground-truth annotations of both splits.
# Missing cells (e.g. an absent second label) are turned into empty strings
# so that later cells can compare them against "".
cta_train_gt = pd.read_csv("data/sets/limaye_cta_train.csv").fillna('')
cta_test_gt = pd.read_csv("data/sets/limaye_cta_test.csv").fillna('')

In [5]:
# Ground-truth lookup:
#   gt[split][file_name][col_index] -> [labels, label_2, all_labels]
# where `labels` is [label] or [label, label_2] depending on whether a
# second label is present (label_2 is "" when it is missing).
gt = {'train': {}, 'test': {}}
for split, gt_frame in (('train', cta_train_gt), ('test', cta_test_gt)):
    for _, record in gt_frame.iterrows():
        labels = [record["label"]]
        if record["label_2"] != "":
            labels.append(record["label_2"])
        per_table = gt[split].setdefault(record["file_name"], {})
        per_table[record["col_index"]] = [labels, record["label_2"], record["all_labels"]]

In [None]:
# Path to the Limaye dataset tables
# Tables can be downloaded at: https://github.com/alan-turing-institute/SemAIDA/tree/master/IJCAI19/SemColHNN_Codes/Limaye
# Keep the trailing slash: the table file name is appended directly to this
# string (f"{table_path}{file_name}.csv") when the tables are read.
table_path = "../../../Benchmarks/Limaye/tables_instance/"

In [None]:
# Label vocabulary of the dataset, read from the labels text file.
all_labels = load_txt_file("data/limayeu-labels/limayeu_all_labels.txt")
# Identity mapping: every label is verbalised as itself.
labels_to_text = {name: name for name in all_labels}

In [None]:
# Build one example per training table: a markdown preview of the first
# non-empty cell values of each kept column, together with the ground-truth
# labels aligned with those columns.
# Each example is
#   [file_name, table_string, ordered_labels, "", ordered_types, column_names]
ROWS_PER_TABLE = 5        # number of rows shown in each markdown preview
MAX_WORDS_PER_CELL = 20   # each cell is truncated to this many words

train_examples = []

# gt["train"] is keyed by file name (in first-seen ground-truth order), so
# every table is processed exactly once even if the ground truth lists several
# labeled columns for it. Looping over the ground-truth rows instead would
# append one identical example per labeled column.
for file_name, labeled_columns in gt["train"].items():

    table_df = pd.read_csv(f"{table_path}{file_name}.csv", header=None)
    table_df = table_df.dropna(how='all')

    ordered_labels = []   # per kept column: list of labels, or "" if unlabeled
    ordered_types = []    # per kept column: the second label ("" if none)
    cleaned_columns = []  # per kept column: exactly ROWS_PER_TABLE strings

    # With header=None the column labels are the integers 0..n-1, so the
    # position i is also the column index used as key in the ground truth.
    for i in range(table_df.shape[1]):
        column = table_df.iloc[:, i]

        if i in labeled_columns:
            ordered_labels.append(labeled_columns[i][0])
            ordered_types.append(labeled_columns[i][1])  # actually second label
        elif column.notna().any():
            # Unlabeled but non-empty columns are kept as context
            ordered_labels.append("")
            ordered_types.append("")
        else:
            # Unlabeled column without any value: left out of the preview
            continue

        # Only the first ROWS_PER_TABLE non-empty cleaned values are shown,
        # so stop cleaning as soon as enough of them were collected.
        cleaned_rows = []
        for cell in column.tolist():
            cleaned = " ".join(clean_text(cell).split()[:MAX_WORDS_PER_CELL])
            if cleaned != "":
                cleaned_rows.append(cleaned)
                if len(cleaned_rows) == ROWS_PER_TABLE:
                    break
        # Pad short columns with empty cells so all columns have equal length
        cleaned_rows.extend([""] * (ROWS_PER_TABLE - len(cleaned_rows)))
        cleaned_columns.append(cleaned_rows)

    column_names = [f"Column {i+1}" for i in range(len(cleaned_columns))]

    # Transpose the per-column value lists into table rows
    table_list_df = [[column_values[j] for column_values in cleaned_columns]
                     for j in range(ROWS_PER_TABLE)]

    # Markdown format
    df_new = pd.DataFrame(table_list_df, columns=column_names)
    # uncomment for knowledge generation set (-kg):
    # df_new = pd.DataFrame(table_list_df, columns=[", ".join([labels_to_text[m] for m in ordered_labels[i] ]) for i in range(len(cleaned_columns))])

    table_string = df_new.to_markdown(index=False)

    train_examples.append([file_name, table_string, ordered_labels, "", ordered_types, column_names])

In [None]:
# Build one example per test table: a markdown preview of the first
# non-empty cell values of each kept column, together with the ground-truth
# labels aligned with those columns.
# Each example is
#   [file_name, table_string, ordered_labels, "", ordered_types, column_names]
ROWS_PER_TABLE = 5        # number of rows shown in each markdown preview
MAX_WORDS_PER_CELL = 20   # each cell is truncated to this many words

test_examples = []

# gt["test"] is keyed by file name (in first-seen ground-truth order), so
# every table is processed exactly once even if the ground truth lists several
# labeled columns for it. Looping over the ground-truth rows instead would
# append one identical example per labeled column.
for file_name, labeled_columns in gt["test"].items():

    table_df = pd.read_csv(f"{table_path}{file_name}.csv", header=None)
    table_df = table_df.dropna(how='all')

    ordered_labels = []   # per kept column: list of labels, or "" if unlabeled
    ordered_types = []    # per kept column: the second label ("" if none)
    cleaned_columns = []  # per kept column: exactly ROWS_PER_TABLE strings

    # With header=None the column labels are the integers 0..n-1, so the
    # position i is also the column index used as key in the ground truth.
    for i in range(table_df.shape[1]):
        column = table_df.iloc[:, i]

        if i in labeled_columns:
            ordered_labels.append(labeled_columns[i][0])
            ordered_types.append(labeled_columns[i][1])  # actually second label
        elif column.notna().any():
            # Unlabeled but non-empty columns are kept as context
            ordered_labels.append("")
            ordered_types.append("")
        else:
            # Unlabeled column without any value: left out of the preview
            continue

        # Only the first ROWS_PER_TABLE non-empty cleaned values are shown,
        # so stop cleaning as soon as enough of them were collected.
        cleaned_rows = []
        for cell in column.tolist():
            cleaned = " ".join(clean_text(cell).split()[:MAX_WORDS_PER_CELL])
            if cleaned != "":
                cleaned_rows.append(cleaned)
                if len(cleaned_rows) == ROWS_PER_TABLE:
                    break
        # Pad short columns with empty cells so all columns have equal length
        cleaned_rows.extend([""] * (ROWS_PER_TABLE - len(cleaned_rows)))
        cleaned_columns.append(cleaned_rows)

    column_names = [f"Column {i+1}" for i in range(len(cleaned_columns))]

    # Transpose the per-column value lists into table rows
    table_list_df = [[column_values[j] for column_values in cleaned_columns]
                     for j in range(ROWS_PER_TABLE)]

    # Markdown format
    df_new = pd.DataFrame(table_list_df, columns=column_names)
    # uncomment for knowledge generation set (-kg);
    # ordered_labels[i] is a list of labels (or "" for unlabeled columns),
    # so the header is the comma-joined text of every label of the column:
    # df_new = pd.DataFrame(table_list_df, columns=[", ".join([labels_to_text[m] for m in ordered_labels[i] ]) for i in range(len(cleaned_columns))])

    table_string = df_new.to_markdown(index=False)

    test_examples.append([file_name, table_string, ordered_labels, "", ordered_types, column_names])

In [15]:
# Report how many examples were built for each split.
print(f"Train examples: {len(train_examples)}, Test examples: {len(test_examples)}")

Train examples: 105, Test examples: 107


In [None]:
# Persist the examples of both splits as pickle files.
pickle_outputs = {
    "../data/limayeu-cta-train.pkl": train_examples,
    "../data/limayeu-cta-test.pkl": test_examples,
}
for output_path, examples in pickle_outputs.items():
    save_pickle_file(output_path, examples)

In [None]:
# Knowledge-generation (-kg) variant of the training set.
# NOTE(review): this saves whatever `train_examples` currently holds. It only
# differs from limayeu-cta-train.pkl if the cell that builds `train_examples`
# was re-run with its "-kg" header line uncommented; otherwise both files
# contain the same examples.
save_pickle_file("../data/limayeu-cta-train-kg.pkl",train_examples)