In [1]:
import pandas as pd
import os, json, time

# Build the input file locations inside the current working directory.
cwd = os.getcwd()
# Only the train and validation splits are available.
train_path, val_path = (
    os.path.join(cwd, split_file) for split_file in ('train.txt', 'valid.txt')
)

def _to_deepke_record(sample: dict) -> dict:
    """
    Flatten one parsed people-relation sample into a single flat row.

    :param sample: dict parsed from one JSON line; must contain ``token``
        (a sequence of strings) plus ``h`` and ``t`` dicts, each with a
        ``name`` and a ``pos`` sequence.
    :return: a new dict with ``sentence``, ``head``, ``head_offset``, ``tail``
        and ``tail_offset`` added, and the ``token``/``h``/``t`` keys removed.
        Any other keys (e.g. ``relation``) are carried over unchanged.
    :raises KeyError: if ``token``, ``h``, ``t``, ``name`` or ``pos`` is missing.
    """
    head, tail = sample['h'], sample['t']
    record = {key: value for key, value in sample.items() if key not in ('token', 'h', 't')}
    # tokens are concatenated without a separator
    record['sentence'] = ''.join(sample['token'])
    record['head'] = head['name']
    # only the first element of ``pos`` is kept as the offset
    record['head_offset'] = head['pos'][0]
    record['tail'] = tail['name']
    record['tail_offset'] = tail['pos'][0]
    return record


def convert_dataset(path: str) -> None:
    """
    convert the people-relation style data into the DeepKE friendly style

    Reads ``path`` as UTF-8 text holding one JSON object per line, flattens
    every object into a row and writes all rows to ``<name>.csv`` in the
    current working directory, where ``<name>`` is the input file name without
    its final extension. Whitespace-only lines are skipped. A short timing
    message is printed when the file has been written.

    :param path: the path of the given txt file
    :return: None; the result is the CSV file written to disk
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    :raises KeyError: if a sample lacks ``token``, ``h``, ``t``, ``name`` or ``pos``
    """

    s_time = time.perf_counter()
    # same column names (and order) as the example dataset
    columns = ['sentence', 'relation', 'head', 'head_offset', 'tail', 'tail_offset']

    # Collect plain dicts and build the frame once afterwards: calling
    # pd.concat for every line copies the whole frame each time (quadratic).
    # TODO: operate the file from multiple processes
    # REF: https://stackoverflow.com/questions/11196367/processing-single-file-from-multiple-processes
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate blank / trailing empty lines instead of crashing in json.loads
                continue
            records.append(_to_deepke_record(json.loads(line)))

    # dtype=object keeps values exactly as parsed (no int -> float upcasting
    # when a column has gaps), so the CSV text matches the parsed values.
    data = pd.DataFrame(records, dtype=object)
    # Expected columns come first; unexpected extra keys follow in
    # first-seen order. reindex also creates any expected column that no
    # record supplied, so the header is complete even for an empty input.
    extras = [col for col in data.columns if col not in columns]
    data = data.reindex(columns=columns + extras)

    # basename/splitext keep multi-dot names intact ('train.v2.txt' -> 'train.v2.csv')
    # and do not depend on which path separator the caller used.
    file_name = os.path.splitext(os.path.basename(path))[0] + '.csv'
    file_path = os.path.join(os.getcwd(), file_name)
    data.to_csv(file_path, index = False, encoding = 'utf-8')

    duration = round((time.perf_counter() - s_time)/60, 3)

    print(f'{file_name} is completed, it costs {duration} minutes.')

    return

In [2]:
# Convert the validation split (valid.txt); the CSV is written to the current working directory.
convert_dataset(val_path)

valid.csv is completed, it costs 0.024 minutes.


In [None]:
# Convert the training split (train.txt); the CSV is written to the current working directory.
convert_dataset(train_path)