# 处理数据，转换成 spaCy 格式

In [14]:
import os
import json
import spacy
from spacy.tokens import DocBin, Doc
from pprint import pprint
from tqdm import tqdm

In [5]:
# Load the resume_zh training split, then show how many records it holds
# and what a single record looks like.
train_file = "./data/resume_zh/train.json"
with open(train_file, mode="r", encoding="utf-8") as fh:
    train_data = json.loads(fh.read())
print(len(train_data))
pprint(train_data[0])

3819
{'ner': [{'index': [0, 1], 'type': 'NAME'},
         {'index': [5, 6, 7, 8], 'type': 'CONT'}],
 'sentence': ['高',
              '勇',
              '：',
              '男',
              '，',
              '中',
              '国',
              '国',
              '籍',
              '，',
              '无',
              '境',
              '外',
              '居',
              '留',
              '权',
              '，'],
 'word': [[0, 1],
          [2],
          [3],
          [4],
          [5, 6],
          [7, 8],
          [9],
          [10],
          [11, 12],
          [13, 14, 15],
          [16]]}


In [19]:
# Sanity-check the conversion on one record before converting whole files.
nlp = spacy.blank("zh")
item = train_data[0]
text = "".join(item["sentence"])
doc: Doc = nlp(text)
entities = []
for annotation in item["ner"]:
    start = annotation["index"][0]
    # The last entry of `index` is the entity's final character (inclusive);
    # char_span expects an exclusive end, like a Python slice, hence the + 1.
    end = annotation["index"][-1] + 1
    label = annotation["type"]
    span = doc.char_span(start, end, label=label, alignment_mode="contract")
    if span is None:
        # char_span yields None when the offsets cannot be mapped onto tokens;
        # report and skip it rather than failing on `span.label_` / `doc.ents`.
        print(f"Skipping unalignable entity [{start}, {end}) {label}")
        continue
    print(span, span.label_)
    entities.append(span)

doc.ents = entities

高勇 NAME
中国国籍 CONT


In [15]:
def convert_file(origin_file, output_file) -> None:
    """Convert a resume_zh JSON file into a serialized spaCy ``DocBin``.

    Every record is read for two keys: ``sentence`` (a list of text pieces that
    are joined to form the document text) and ``ner`` (a list of dicts carrying
    an ``index`` list of character positions and a ``type`` label).

    Entities whose character offsets cannot be mapped onto the document's
    tokens are skipped and counted; a one-line summary is printed if any were
    dropped, instead of aborting the conversion with an obscure error.

    Args:
        origin_file: Path of the JSON file to read.
        output_file: Path the ``DocBin`` is written to.
    """
    nlp = spacy.blank("zh")
    db = DocBin()

    # Load everything first so the file handle is released before the loop.
    with open(origin_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    skipped = 0
    for item in tqdm(data):
        text = "".join(item["sentence"])
        doc: Doc = nlp(text)
        entities = []
        for annotation in item["ner"]:
            start = annotation["index"][0]
            # The last entry of `index` is the entity's final character
            # (inclusive); char_span expects an exclusive end, like a Python
            # slice, hence the + 1.
            end = annotation["index"][-1] + 1
            span = doc.char_span(
                start, end, label=annotation["type"], alignment_mode="contract"
            )
            if span is None:
                # No token-aligned span exists for these offsets.
                skipped += 1
                continue
            entities.append(span)

        doc.ents = entities
        db.add(doc)

    if skipped:
        print(f"{origin_file}: skipped {skipped} entities that could not be aligned to tokens")

    db.to_disk(output_file)

In [13]:
# Convert each dataset split; the output directory is created on first use.
os.makedirs("./data/resume_zh_spacy", exist_ok=True)
split_jobs = [
    ("./data/resume_zh/train.json", "./data/resume_zh_spacy/train.spacy"),
    ("./data/resume_zh/dev.json", "./data/resume_zh_spacy/dev.spacy"),
    ("./data/resume_zh/test.json", "./data/resume_zh_spacy/test.spacy"),
]
for origin_path, spacy_path in split_jobs:
    convert_file(origin_path, spacy_path)

100%|██████████| 3819/3819 [00:00<00:00, 4579.69it/s]
100%|██████████| 463/463 [00:00<00:00, 4543.28it/s]
100%|██████████| 477/477 [00:00<00:00, 2689.39it/s]
