In [1]:
import functools
import multiprocessing
import re

import pandas as pd

In [2]:
# Load the CPA ground-truth annotations for the train / validation / test splits.
cpa_train_gt, cpa_val_gt, cpa_test_gt = map(
    pd.read_csv,
    (
        'data/CPA/CPA_training_gt.csv',
        'data/CPA/CPA_validation_gt.csv',
        'data/CPA/CPA_test_gt.csv',
    ),
)

In [3]:
# Display the training ground truth to inspect its columns
# (table_name, main_column_index, column_index, label).
cpa_train_gt

Unnamed: 0,table_name,main_column_index,column_index,label
0,Product_michaelirvine.com_September2020_CPA.js...,0,6,priceValidUntil
1,Product_michaelirvine.com_September2020_CPA.js...,0,1,description
2,Product_michaelirvine.com_September2020_CPA.js...,0,2,brand
3,Product_michaelirvine.com_September2020_CPA.js...,0,3,mpn
4,Product_michaelirvine.com_September2020_CPA.js...,0,4,priceCurrency
...,...,...,...,...
134420,CreativeWork_millfield.ie_September2020_CPA.js...,0,6,datePublished
134421,LocalBusiness_tailoredliving.com_September2020...,0,4,areaServed
134422,Person_altschools.net_September2020_CPA.json.gz,0,2,faxNumber
134423,Product_idealpatient.com_September2020_CPA.jso...,0,7,itemReviewed


In [4]:
def _encode_gt_columns(gt):
    """Serialise each ground-truth row as 'table_name|column_index|label'."""
    index_as_text = gt['column_index'].map(str)
    encoded = gt['table_name'] + '|' + index_as_text + '|' + gt['label']
    return encoded.tolist()


# One encoded string per annotated column, for each split.
cpa_train_cols = _encode_gt_columns(cpa_train_gt)
cpa_val_cols = _encode_gt_columns(cpa_val_gt)
cpa_test_cols = _encode_gt_columns(cpa_test_gt)

In [5]:
#Read relation vocabulary
# Each non-empty line is tab-separated and the label is taken from its second field.
# The `with` block closes the file handle deterministically; blank lines are
# skipped so a trailing empty line cannot raise IndexError on the split.
with open("data/relation_vocab.txt", 'r') as types_file:
    type_labels = [
        line.replace('\n', '').split('\t')[1]
        for line in types_file
        if line.strip()
    ]

In [6]:
#Simple Preprocessing

def clean_text(text):
    """Flatten a cell value into a single ASCII-only, space-normalised string.

    Dicts contribute their values and lists their items, each cleaned
    recursively and joined with single spaces. Null values (as judged by
    ``pd.isnull``) become the empty string. Any other value is passed through
    ``str`` before cleaning.
    """
    # Collapse containers into one string before the scalar clean-up below.
    if isinstance(text, dict):
        text = ' '.join(clean_text(value) for value in text.values())
    elif isinstance(text, list):
        text = ' '.join(clean_text(item) for item in text)

    if pd.isnull(text):
        return ''

    # Each run of non-ASCII characters is replaced by one space ...
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", str(text))
    # ... then runs of spaces are collapsed and the ends trimmed.
    return re.sub(' +', ' ', ascii_only).strip()


In [9]:
# Prepare format of input datasets for LM models: table_id, [labels], data, label_ids

@functools.lru_cache(maxsize=None)
def _cpa_split_table_names():
    """Return (train, validation) table-name sets, built once per process.

    The result is cached, so re-run this cell if the ground-truth frames are
    reloaded afterwards.
    """
    return (
        frozenset(cpa_train_gt['table_name']),
        frozenset(cpa_val_gt['table_name']),
    )


def get_table_column(column):
    """Load one annotated column and format it as an LM input record.

    Parameters
    ----------
    column : str
        Encoded as ``'table_name|column_index|label'``.

    Returns
    -------
    list
        ``[table_id, column_id, [label], data, label_ids]`` where ``column_id``
        is the index string parsed from ``column``, ``data`` is the cleaned text
        of the column's values and ``label_ids`` is a one-hot list over
        ``type_labels``.

    Raises
    ------
    ValueError
        If ``label`` is not present in ``type_labels``.
    """
    # Split from the right so a '|' inside the file name cannot break unpacking.
    file_name, column_index, label = column.rsplit('|', 2)

    # Set membership replaces rebuilding and scanning the full table-name list
    # on every call, which was quadratic over the whole dataset.
    train_tables, val_tables = _cpa_split_table_names()
    if file_name in train_tables:
        path = 'data/CPA/Train/'+file_name
    elif file_name in val_tables:
        path = 'data/CPA/Validation/'+file_name
    else:
        path = 'data/CPA/Test/'+file_name

    df = pd.read_json(path, compression='gzip', lines=True)

    # One-hot encode the label over the relation vocabulary.
    y = [0] * len(type_labels)
    y[type_labels.index(label)] = 1

    return [
        file_name, #table_id
        column_index, #column_id
        [label], #[labels]
        clean_text(df.iloc[:, int(column_index)].tolist()), #data
        y #label_ids
    ]

In [10]:
# Extract every annotated column of each split in parallel.
# The context manager tears the worker processes down even if one of the maps
# raises (e.g. a missing table file), instead of leaking them into the kernel.
# All results have been returned by the time the block exits.
with multiprocessing.Pool(processes=4) as pool:
    train_result = pool.map(get_table_column, cpa_train_cols)
    val_result = pool.map(get_table_column, cpa_val_cols)
    test_result = pool.map(get_table_column, cpa_test_cols)

In [21]:
def get_main_column(file_name):
    """Load the main (subject) column of a table as an unlabelled LM input record.

    Parameters
    ----------
    file_name : str
        Table file name; its split directory is resolved from the ground-truth
        frames, falling back to the test directory.

    Returns
    -------
    list
        ``[table_id, 0, [], data, label_ids]`` where ``data`` is the cleaned
        text of column 0 and ``label_ids`` is an all-zero list over
        ``type_labels``.
    """
    if file_name in cpa_train_gt['table_name'].tolist():
        path = 'data/CPA/Train/'+file_name
    elif file_name in cpa_val_gt['table_name'].tolist():
        path = 'data/CPA/Validation/'+file_name
    else:
        path = 'data/CPA/Test/'+file_name

    #Open table
    df = pd.read_json(path, compression='gzip', lines=True)
    # The main column carries no relation label, so its target stays all zeros.
    y = [0] * len(type_labels)

    # Select column 0 (all rows). The previous `df.iloc[0]` returned the first
    # row, mixing values from every column.
    # NOTE(review): assumes main_column_index is always 0, as in the displayed
    # ground truth - confirm against the full files.
    return [
        file_name, #table_id
        0, #main column index
        [], #[labels]
        clean_text(df.iloc[:, 0].tolist()), #data
        y #label_ids
    ]

In [22]:
# Extract the main column of every distinct table in each split in parallel.
# The context manager tears the worker processes down even if one of the maps
# raises, instead of leaking them into the kernel. All results have been
# returned by the time the block exits.
with multiprocessing.Pool(processes=4) as pool:
    train_main_cols = pool.map(get_main_column, cpa_train_gt['table_name'].unique())
    val_main_cols = pool.map(get_main_column, cpa_val_gt['table_name'].unique())
    test_main_cols = pool.map(get_main_column, cpa_test_gt['table_name'].unique())

In [24]:
# Combine the labelled columns and the main columns of each split into one frame.
lm_columns = ['table_id', 'column_id', 'labels', 'data', 'label_ids']
cpa = {
    'train': pd.DataFrame(train_result + train_main_cols, columns=lm_columns),
    'dev': pd.DataFrame(val_result + val_main_cols, columns=lm_columns),
    'test': pd.DataFrame(test_result + test_main_cols, columns=lm_columns),
}

In [26]:
import pickle

# Persist the per-split frames. The `with` block closes the file even if
# pickling fails partway, so the handle is never left open.
file_name='data/CPA/cpa_lm.pkl'
with open(file_name, 'wb') as f:
    pickle.dump(cpa, f)