In [2]:
import json

def _load_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the result."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Two raw corpus files from the same release directory; both are consumed
# later by create_data().
data_s = _load_json('./NIKL_NE_2022_v1.0/SXNE2202211218.json')
data_n = _load_json('./NIKL_NE_2022_v1.0/NXNE2202211218.json')

In [72]:
from io import StringIO
import random

class Data:
    """Mutable container that collects sentences and their label strings.

    Attributes are plain Python collections that callers fill in place:

    * ``label_set`` -- set of label strings, pre-seeded with ``'UNK'``.
    * ``sentences`` -- list, starts empty.
    * ``labels``    -- list, starts empty.
    * ``target``    -- the ``target`` argument as given, or a fresh empty
      list when the argument is falsy (``None``, ``[]``, ...).
    """

    def __init__(self, target=None) -> None:
        self.target = target or []
        self.sentences, self.labels = [], []
        # 'UNK' is always present in the label vocabulary.
        self.label_set = {'UNK'}

def _tag_words(words, entities):
    """Return one tag per entry of ``words`` for a single corpus sentence.

    ``words`` and ``entities`` are sequences of dicts carrying ``'begin'`` and
    ``'end'`` character offsets (entities also carry ``'label'``).  Entities
    are consumed in the order given, so they are assumed to be sorted by
    position -- TODO confirm against the corpus specification.

    A word overlapping the current entity is tagged ``<label>-B`` when the
    entity starts inside that word and ``<label>-I`` otherwise; all other
    words are tagged ``'O'``.  Only one tag is emitted per word, so when
    several entities fall inside the same word the first one wins.
    """
    total = len(words)
    if not entities:
        return ['O'] * total

    tags = []
    pending = iter(entities)
    current = next(pending)

    try:
        for position, word in enumerate(words, 1):
            word_beg = word['begin']
            word_end = word['end']

            if current['begin'] <= word_end:  # word reaches the entity
                if word_beg <= current['begin']:  # entity starts in this word
                    tags.append(f"{current['label']}-B")
                else:  # continuation of an entity started in an earlier word
                    tags.append(f"{current['label']}-I")

                # Skip EVERY entity that finishes inside this word.  Advancing
                # only once would leave a second same-word entity pending and
                # mis-tag the following word as its "-I" continuation.
                while current['end'] <= word_end:
                    current = next(pending)
            else:
                tags.append('O')

    except StopIteration:
        # No entities left: the word at `position` is already tagged, every
        # later word is outside any entity.
        tags.extend(['O'] * (total - position))

    return tags


def create_data(data: "Data", raw_data, filter=True):
    """Append sentences and word-level labels from ``raw_data`` to ``data``.

    Args:
        data: accumulator exposing ``target``, ``sentences``, ``labels`` and
            ``label_set``; the last three are updated in place.
        raw_data: parsed corpus with the layout
            ``raw_data['document'][i]['sentence'][j]`` ->
            ``{'form': str, 'word': [...], 'NE': [...]}``.
        filter: when true, only entities whose ``'label'`` is in
            ``data.target`` are kept; the rest are treated as ``'O'``.
            (The name shadows the builtin but is part of the signature.)

    Consecutive corpus sentences are joined with a single space until one of
    them ends in ``.``, ``?`` or ``!``; the joined text is then stored in
    ``data.sentences`` and its space-separated tags in ``data.labels``.

    NOTE(review): text left over at the end of a document without terminal
    punctuation is discarded -- confirm this is intended.
    """
    # Build the lookup once instead of scanning the target list per entity.
    wanted = frozenset(data.target) if filter else None

    for document in raw_data['document']:
        sent = StringIO()
        label = []

        for words in document['sentence']:
            form = words['form']
            sent.write(form.replace('|', ''))

            ne = words['NE']
            if filter:  # drop entities with non-target labels
                ne = [ne_item for ne_item in ne if ne_item['label'] in wanted]

            label.extend(_tag_words(words['word'], ne))

            if form.endswith(('.', '?', '!')):  # end of sentence: flush
                data.label_set.update(label)
                data.sentences.append(sent.getvalue())
                data.labels.append(' '.join(label))
                sent = StringIO()
                label = []
            else:
                sent.write(' ')



def write_data(data: "Data", output_train, output_test, seed=None, train_ratio=0.8):
    """Randomly split ``data`` into a train and a test TSV file.

    Each ``(sentence, labels)`` pair from ``data.sentences`` / ``data.labels``
    is written as ``sentence<TAB>labels`` to ``output_train`` with probability
    ``train_ratio`` and to ``output_test`` otherwise.  The split is decided
    per line, so the resulting sizes only approximate the ratio.

    Args:
        data: object exposing parallel ``sentences`` and ``labels`` sequences.
        output_train: path of the training file (overwritten).
        output_test: path of the test file (overwritten).
        seed: when not ``None``, the module-level ``random`` generator is
            seeded with it first, making the split reproducible.  ``0`` is a
            valid seed.
        train_ratio: probability in ``[0, 1]`` that a line goes to the
            training file.

    Raises:
        ValueError: if ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f'train_ratio must be within [0, 1], got {train_ratio!r}')

    with open(output_train, 'w', encoding='utf-8') as f_train, open(output_test, 'w', encoding='utf-8') as f_test:
        # `is not None` rather than truthiness: seed=0 must still seed the RNG.
        if seed is not None:
            random.seed(seed)
        for s, l in zip(data.sentences, data.labels):
            destination = f_train if random.random() < train_ratio else f_test
            destination.write(f'{s}\t{l}\n')

def write_labels(data, output_label):
    """Write the labels in ``data.label_set`` to ``output_label``.

    The labels are sorted and written one per line (UTF-8); an existing file
    at ``output_label`` is overwritten.
    """
    ordered = sorted(data.label_set)

    with open(output_label, 'w', encoding='utf-8') as out:
        out.writelines(f'{name}\n' for name in ordered)

In [73]:
# Entity labels to keep; with filter=True create_data() ignores entities
# carrying any other label.
target = [
    'PS_NAME', 'PS_PET',
    'OGG_EDUCATION', 'OGG_MEDICAL',
    'LCP_COUNTRY', 'LCP_PROVINCE', 'LCP_COUNTY', 'LCP_CITY', 'LCP_CAPITALCITY',
    'QT_AGE', 'QT_ADDRESS',
    'TMM_DISEASE', 'TMM_DRUG',
]
data = Data(target=target)

# Accumulate both corpora into the same Data object, in this order.
for raw_corpus in (data_s, data_n):
    create_data(data, raw_corpus, filter=True)

write_data(
    data,
    './data/train_reduced_label.tsv',
    './data/test_reduced_label.tsv',
    seed=42,
)

In [74]:
# Persist the label vocabulary gathered in `data.label_set`.
write_labels(data, output_label='./data/reduced_label.txt')