In [1]:
import pandas as pd
import re
import multiprocessing

In [2]:
#Read the ground truth files
# Load the ground-truth annotations of the training, validation and test splits.
cta_train_gt, cta_val_gt, cta_test_gt = map(
    pd.read_csv,
    (
        'data/CTA/CTA_training_gt.csv',
        'data/CTA/CTA_validation_gt.csv',
        'data/CTA/CTA_test_gt.csv',
    ),
)

In [3]:
# Inspect the training ground truth: one row per annotated column
# (table_name, column_index, label).
cta_train_gt

Unnamed: 0,table_name,column_index,label
0,Product_corememoriesco.com_September2020_CTA.j...,5,currency
1,Product_corememoriesco.com_September2020_CTA.j...,0,Product/name
2,Product_corememoriesco.com_September2020_CTA.j...,3,price
3,CreativeWork_paintout.org_September2020_CTA.js...,2,DateTime
4,Product_michaelirvine.com_September2020_CTA.js...,4,currency
...,...,...,...
130466,Event_kivasports.net_September2020_CTA.json.gz,1,Event/name
130467,Event_kivasports.net_September2020_CTA.json.gz,5,telephone
130468,Event_kivasports.net_September2020_CTA.json.gz,6,Place
130469,Event_kivasports.net_September2020_CTA.json.gz,13,telephone


In [4]:
# Encode every ground-truth row as a single 'table_name|column_index|label'
# string so it can be handed to worker processes as one plain argument.
cta_train_cols, cta_val_cols, cta_test_cols = (
    (
        gt_frame['table_name']
        + '|'
        + gt_frame['column_index'].map(str)
        + '|'
        + gt_frame['label']
    ).tolist()
    for gt_frame in (cta_train_gt, cta_val_gt, cta_test_gt)
)

In [5]:
#Read type vocabulary
# Each line is tab-separated; the type label is taken from the second field.
# The context manager closes the file handle as soon as the labels are read.
with open("data/type_vocab.txt", 'r') as types_file:
    type_labels = [
        line.replace('\n', '').split('\t')[1]
        for line in types_file
        if line.strip()  # skip blank lines, which would otherwise raise IndexError
    ]

In [6]:
#Simple Preprocessing

def clean_text(text):
    """Flatten a cell value into a single cleaned ASCII string.

    Dicts contribute their values and lists their items; both are cleaned
    recursively and joined with single spaces. Null scalars (as judged by
    ``pd.isnull``) become the empty string. Every run of non-ASCII characters
    is replaced by a space, repeated spaces are collapsed, and the result is
    stripped.
    """
    # Nested containers are reduced to one space-joined string first.
    if isinstance(text, dict):
        text = ' '.join(clean_text(value) for value in text.values())
    elif isinstance(text, list):
        text = ' '.join(clean_text(item) for item in text)

    if pd.isnull(text):
        return ''

    # Non-ASCII runs turn into a space, then runs of spaces are squeezed.
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", str(text))
    return re.sub(' +', ' ', ascii_only).strip()


In [7]:
# Prepare format of input datasets for LM models: table_id, [labels], data, label_ids
# Lazily-built mapping from table file name to the directory of its split.
# It is built once per process, instead of materialising both ground-truth
# name lists and scanning them linearly on every call.
# NOTE: re-run this cell after reloading the ground-truth frames to reset it.
_TABLE_DIR_BY_NAME = None


def _table_dir(file_name):
    """Return the directory of `file_name`: Train, else Validation, else Test."""
    global _TABLE_DIR_BY_NAME
    if _TABLE_DIR_BY_NAME is None:
        # Validation is inserted first and then overwritten by train, so a name
        # present in both resolves to Train (train -> validation -> test order).
        lookup = dict.fromkeys(cta_val_gt['table_name'], 'data/CTA/Validation/')
        lookup.update(dict.fromkeys(cta_train_gt['table_name'], 'data/CTA/Train/'))
        _TABLE_DIR_BY_NAME = lookup
    return _TABLE_DIR_BY_NAME.get(file_name, 'data/CTA/Test/')


def get_table_column(column):
    """Load one annotated table column and format it for the LM datasets.

    Parameters
    ----------
    column : str
        A 'table_name|column_index|label' key.

    Returns
    -------
    list
        ``[table_id, [label], data, label_ids]`` where ``data`` is the cleaned,
        space-joined text of the column's cells and ``label_ids`` is a one-hot
        list over ``type_labels``.

    Raises
    ------
    ValueError
        If the key does not split into exactly three '|'-separated parts, or
        if ``label`` is not present in ``type_labels``.
    """
    file_name, column_index, label = column.split('|')

    # The table lives in the directory of the split whose ground truth lists it.
    path = _table_dir(file_name) + file_name

    df = pd.read_json(path, compression='gzip', lines=True)

    # One-hot encode the label over the type vocabulary.
    y = [0] * len(type_labels)
    y[type_labels.index(label)] = 1

    return [
        file_name, #table_id
        [label], #[labels]
        clean_text(df.iloc[:, int(column_index)].tolist()), #data #To cut data to 200,000 length [:200000]
        y #label_ids
    ]

In [8]:
# Build the LM-ready rows of every split in parallel. The context manager
# makes sure the worker processes are cleaned up even when a map call raises;
# on the normal path the pool is still closed and joined gracefully first.
with multiprocessing.Pool(processes=4) as pool:
    train_result = pool.map(get_table_column, cta_train_cols)
    val_result = pool.map(get_table_column, cta_val_cols)
    test_result = pool.map(get_table_column, cta_test_cols)
    pool.close()
    pool.join()

In [10]:
# Write each split as a gzip-compressed CSV with the LM dataset columns.
for _split_rows, _csv_path in (
    (train_result, 'data/CTA/cta_train_lm.csv.gz'),
    (val_result, 'data/CTA/cta_val_lm.csv.gz'),
    (test_result, 'data/CTA/cta_test_lm.csv.gz'),
):
    _split_frame = pd.DataFrame(_split_rows, columns=['table_id', 'labels', 'data', 'label_ids'])
    _split_frame.to_csv(_csv_path, compression='gzip')

In [11]:
# Collect the three splits as DataFrames under the keys 'train', 'dev' and 'test'.
cta = {
    split_name: pd.DataFrame(split_rows, columns=['table_id', 'labels', 'data', 'label_ids'])
    for split_name, split_rows in (
        ('train', train_result),
        ('dev', val_result),
        ('test', test_result),
    )
}

In [13]:
# Export the training split as JSON; orient='values' serialises only the
# values array, without column names or index labels.
cta['train'].to_json('data/CTA/cta_train_lm.json', orient='values')

In [12]:
import pickle

# Persist all three splits in a single pickle file. The context manager closes
# the file handle even if pickling fails part-way through.
file_name = 'data/CTA/cta_lm.pkl'
with open(file_name, 'wb') as f:
    pickle.dump(cta, f)