In [1]:
import json 
import pandas as pd
import csv 
import pickle
import numpy as np
import re
import os
import gzip
from tqdm import tqdm

In [2]:
# Load the CPA label vocabulary. Each line of cpa_vocab.txt is tab-separated and
# the label name is taken from the second field. The file is read inside a
# `with` block so the handle is closed even if parsing a line fails, and the
# handle gets its own name instead of temporarily living in `cpa_vocab`.
with open("./cpa_vocab.txt", 'r') as vocab_file:
    cpa_vocab = [line.replace('\n', '').split('\t')[1] for line in vocab_file]
len(cpa_vocab) #121

121

In [3]:
def output_cpa_label_ids(label_list):
    """Encode a list of label names as a multi-hot vector over ``cpa_vocab``.

    Args:
        label_list: iterable of label names; every name must occur in the
            module-level ``cpa_vocab`` list.

    Returns:
        A list of ints with ``len(cpa_vocab)`` entries: 1 at the position of
        each given label (first occurrence in ``cpa_vocab``), 0 elsewhere.

    Raises:
        ValueError: if a label is not present in ``cpa_vocab``.
    """
    # Resolve every label to its vocabulary position first; duplicates collapse.
    hot_positions = {cpa_vocab.index(label) for label in label_list}
    return [1 if pos in hot_positions else 0 for pos in range(len(cpa_vocab))]

In [4]:
# Load the relation-extraction tables for each split. The columns are unnamed
# (integer labels), so later code addresses them by position.
df_ori_cpa_train = pd.read_json('Output/train.table_rel_extraction.json', orient='records')
df_ori_cpa_dev = pd.read_json('Output/dev.table_rel_extraction.json', orient='records')
df_ori_cpa_test = pd.read_json('Output/test.table_rel_extraction.json', orient='records')
# Only the last expression of a cell is rendered, so the bare train/dev
# expressions that used to sit between the loads displayed nothing and were
# dropped; the test frame is still shown as the cell output.
df_ori_cpa_test

Unnamed: 0,0,1,2,3,4,5,6,7
0,670791-1,yukon legislative assembly,670791,current members,,"[name, party]","[[[[0, 0], [33384061, 'Kevin Barr']], [[1, 0],...",[[government.politician.party-government.polit...
1,37797825-1,list of songs recorded by led zeppelin,37797825,,references,"[song, writer(s)]","[[[[0, 0], [1727257, '"" Achilles Last Stand ""'...",[[music.composition.composer]]
2,33863640-1,list of air india fc managers,33863640,statistics,,"[name, nationality]","[[[[0, 0], [39919400, 'Bimal Ghosh']], [[2, 0]...","[[people.person.nationality, people.person.pla..."
3,56602-2,astana,56602,sport,,"[club, sport, league]","[[[[0, 0], [21491213, 'Astana F.C.']], [[1, 0]...","[[sports.sports_team.sport], [sports.sports_te..."
4,29014813-1,northwestern mexico,29014813,states,,"[state, capital, largest city]","[[[[0, 0], [199793, 'Baja California']], [[1, ...",[[location.administrative_division.capital-loc...
...,...,...,...,...,...,...,...,...
1462,19783157-1,list of house members of the 40th parliament o...,19783157,alberta,,"[name, electoral district]","[[[[0, 0], [2266595, 'Lee Richardson']], [[1, ...",[[government.politician.government_positions_h...
1463,1276543-4,balaji telefilms,1276543,film productions,,"[title, director(s)]","[[[[0, 0], [10140052, 'Kyo Kii... Main Jhuth N...","[[film.film.directed_by, award.award_nominated..."
1464,23000314-2,list of fantasy films of the 2010s,23000314,forthcoming,,"[title, director, cast]","[[[[2, 0], [29769606, '47 Ronin']], [[3, 0], [...",[[award.award_nominated_work.award_nominations...
1465,808980-1,list of rulers of austria,808980,house of babenberg,,"[name, birth, marriages, death]","[[[[0, 0], [1100652, 'Leopold I the Illustriou...","[[people.person.parents], [people.person.spous..."


In [5]:
def clean_text(text):
    """Normalise a cell value into a plain ASCII, single-spaced string.

    Dicts and lists are flattened first: each value (dict) or item (list) is
    cleaned recursively and the pieces are joined with single spaces. Joining
    can blur the boundaries between the original elements.

    Args:
        text: a scalar, ``None``/NaN, a list, or a dict.

    Returns:
        ``''`` for null input; otherwise ``str(text)`` with every run of
        non-ASCII characters replaced by one space, runs of spaces collapsed
        to one, and leading/trailing whitespace stripped.
    """
    if isinstance(text, dict):
        text = ' '.join(clean_text(value) for value in text.values())
    elif isinstance(text, list):
        text = ' '.join(clean_text(item) for item in text)

    if pd.isnull(text):
        return ''

    # Replace each run of non-ASCII characters with a single space ...
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", str(text))
    # ... then squeeze repeated spaces (only ' ', not tabs/newlines) and trim.
    return re.sub(' +', ' ', ascii_only).strip()

def create_table(table_id, data, labels, split, csv_list):
    """Write one table as gzip-compressed JSON lines and record its labels.

    Args:
        table_id: identifier used for the output file name and label rows.
        data: iterable of cell groups; each cell is indexed as
            ``[[row, col], [_, text]]`` (the first item of the second pair is
            not read here -- presumably an entity id; verify against the data).
        labels: iterable of label-name lists. The k-th entry (1-based) becomes
            the row ``[table_id, k, multi_hot]`` appended to ``csv_list``.
        split: sub-directory name under ``./turl_cpa_data/``.
        csv_list: list that is extended in place with the label rows.

    Returns:
        Always 1.

    Side effects:
        Creates ``./turl_cpa_data/<split>/`` if needed and writes
        ``<table_id>.json.gz`` there, unless that file already exists. Label
        rows are appended to ``csv_list`` either way.
    """
    # Group cell texts by column index, in the order the cells are listed.
    # NOTE(review): the row index of a cell is not used for placement, so the
    # values of a column are packed from the top in listing order. If some
    # rows are missing from a column, rows will not line up across columns --
    # confirm this is acceptable for downstream consumers.
    texts_by_col = {}
    for cell_group in data:
        for cell in cell_group:
            col_idx = cell[0][1]
            cell_text = cell[1][1]
            texts_by_col.setdefault(col_idx, []).append(cell_text)

    # One inner list per column (ascending column index); transposing turns
    # them into DataFrame columns, with shorter columns padded by nulls.
    column_values = [texts_by_col[col_idx] for col_idx in sorted(texts_by_col)]
    new_df = pd.DataFrame(column_values).transpose()

    # Clean every cell; nulls from the padding become empty strings.
    new_df = new_df.apply(np.vectorize(clean_text))

    for column_index, column_labels in enumerate(labels, start=1):
        csv_list.append([table_id, column_index, output_cpa_label_ids(column_labels)])

    out_dir = './turl_cpa_data/{}/'.format(split)
    save_to = out_dir + str(table_id) + '.json.gz'

    # Existing files are left untouched so a re-run does not rewrite them.
    if not os.path.exists(save_to):
        # to_json does not create missing directories; without this a fresh
        # checkout fails with FileNotFoundError on the first table.
        os.makedirs(out_dir, exist_ok=True)
        new_df.to_json(save_to, compression='gzip', orient='records', lines=True)
    return 1

def create_table_and_csv(df, split, csv_list):
    """Run ``create_table`` for every row of ``df``.

    Args:
        df: DataFrame whose columns are labelled with integers; column 0 is
            passed as the table id, column 6 as the cell data and column 7 as
            the labels.
        split: forwarded to ``create_table`` (output sub-directory name).
        csv_list: list extended in place by ``create_table`` with label rows.
    """
    print('Handling {} rows'.format(len(df)))
    # iterrows() is a generator with no length, so the total is passed
    # explicitly; otherwise the progress bar cannot show percentage or ETA.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        create_table(row[0], row[6], row[7], split, csv_list)

In [6]:
# Accumulators for the [table_id, column_index, label_ids] rows of each split;
# create_table_and_csv appends to them in place.
train_final_csv_list = []
dev_final_csv_list = []
test_final_csv_list = []

print('Start test')
create_table_and_csv(df_ori_cpa_test, 'Test', test_final_csv_list)
print('Start dev')
create_table_and_csv(df_ori_cpa_dev, 'Validation', dev_final_csv_list)
print('Start train')
create_table_and_csv(df_ori_cpa_train, 'Train', train_final_csv_list)

label_columns = ['table_id', 'column_index', 'label_ids']
test_df = pd.DataFrame(test_final_csv_list, columns=label_columns)
dev_df = pd.DataFrame(dev_final_csv_list, columns=label_columns)
train_df = pd.DataFrame(train_final_csv_list, columns=label_columns)

final_dict = {'train': train_df, 'dev': dev_df, 'test': test_df}

# `with` guarantees the pickle file is flushed and closed even if dump() raises.
with open('./turl_cpa_data/cpa_turl_lm.pkl', 'wb') as f:
    pickle.dump(final_dict, f)

Start test
Handling 1467 rows


1467it [00:03, 458.49it/s]


Start dev
Handling 1560 rows


1560it [00:03, 466.59it/s]


Start train
Handling 52943 rows


52943it [01:48, 487.43it/s]


In [7]:
# Display the label rows (table_id, column_index, label_ids) built for the test split.
test_df

Unnamed: 0,table_id,column_index,label_ids
0,670791-1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,37797825-1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,33863640-1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,56602-2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,56602-2,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
2067,808980-1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2068,808980-1,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2069,808980-2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2070,808980-2,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Spot-check: read one gzip-compressed JSON-lines table back and print it.
# NOTE(review): this path is under ./turl_data/, whereas create_table in this
# notebook writes to ./turl_cpa_data/ -- confirm which directory is intended.
df = pd.read_json('./turl_data/Train/27282378-2.json.gz', compression='gzip', lines=True)
print(df)

                                                   0  \
0            Guto Bebb Category:Articles with hCards   
1        Kevin Brennan Category:Articles with hCards   
2         Chris Bryant Category:Articles with hCards   
3          Alun Cairns Category:Articles with hCards   
4         Martin Caton Category:Articles with hCards   
5            Ann Clwyd Category:Articles with hCards   
6        Stephen Crabb Category:Articles with hCards   
7          Wayne David Category:Articles with hCards   
8         David Davies Category:Articles with hCards   
9       Geraint Davies Category:Articles with hCards   
10         Glyn Davies Category:Articles with hCards   
11    Jonathan Edwards Category:Articles with hCards   
12         Chris Evans Category:Articles with hCards   
13      Jonathan Evans Category:Articles with hCards   
14          Paul Flynn Category:Articles with hCards   
15       Hywel Francis Category:Articles with hCards   
16        Nia Griffith Category:Articles with hC