In [1]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
import time

In [2]:
# Maps the entity-type names found in the annotation files to the tag
# suffixes used when building the B-/I- labels.
label_dict = {
    '药物': 'DRUG',
    '解剖部位': 'BODY',
    '疾病和诊断': 'DISEASES',
    '影像检查': 'EXAMINATIONS',
    '实验室检验': 'TEST',
    '手术': 'TREATMENT',
}

# Destination files for the preprocessed train / validation / test splits.
TRAIN = './CCKS_2019_Task1/preprocessed_data/train_dataset.txt'
VALID = './CCKS_2019_Task1/preprocessed_data/val_dataset.txt'
TEST = './CCKS_2019_Task1/preprocessed_data/test_dataset.txt'

def sentence2BIOlabel(sentence, label_from_file):
    """Convert one text's entity annotations into per-character BIO tags.

    Args:
        sentence: The raw text; one tag is produced per character.
        label_from_file: Contents of the matching annotation file. Every
            non-blank line is tab-separated with the start offset, end
            offset and entity-type name in columns 1, 2 and 3 (column 0 is
            not read here). Blank lines are ignored.

    Returns:
        A list of ``len(sentence)`` tags: ``'B-<TYPE>'`` at each entity's
        start offset, ``'I-<TYPE>'`` from ``start + 1`` up to (but not
        including) the end offset, and ``'O'`` everywhere else.

    Raises:
        KeyError: If an entity-type name is not a key of ``label_dict``.
        ValueError: If an offset column is not an integer.
        IndexError: If a non-blank line has fewer than four columns, or an
            offset points past the end of ``sentence``.
    """
    sentence_label = ['O'] * len(sentence)

    for line in label_from_file.split('\n'):
        line = line.strip()
        if not line:
            # Empty input, blank separator lines and the '\r' left behind by
            # CRLF blank lines carry no annotation; indexing into them would
            # raise IndexError.
            continue

        entity_info = line.split('\t')
        start_index = int(entity_info[1])
        # NOTE(review): the end offset is treated as exclusive below --
        # confirm this matches the annotation files' convention.
        end_index = int(entity_info[2])
        entity_label = label_dict[entity_info[3]]

        # First character of the entity: B-xx
        sentence_label[start_index] = 'B-' + entity_label
        # Remaining characters of the entity: I-xx
        for i in range(start_index + 1, end_index):
            sentence_label[i] = 'I-' + entity_label
    return sentence_label

def loadRawData(fileName):
    """Load every text/annotation file pair in a directory and BIO-tag it.

    A file whose name contains ``'original'`` is read as raw text; its
    annotation file is the same name with ``'-original'`` removed. Both are
    read as UTF-8 and stripped of surrounding whitespace.

    Args:
        fileName: Path of the directory to scan (despite the name, this is a
            directory, not a file). A trailing path separator is optional.

    Returns:
        A ``(sentence_list, label_list)`` tuple: the texts, and for each text
        the tag list produced by ``sentence2BIOlabel``, both in sorted
        file-name order.
    """
    sentence_list = []
    label_list = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order stable from run to run.
    for file_name in sorted(os.listdir(fileName)):
        # Annotation files, '.DS_Store' and anything else without 'original'
        # in its name are not raw-text files.
        if 'original' not in file_name:
            continue

        # os.path.join works whether or not fileName ends with a separator.
        org_file = os.path.join(fileName, file_name)
        lab_file = os.path.join(fileName, file_name.replace('-original', ''))

        with open(org_file, encoding='utf-8') as f:
            content = f.read().strip()

        with open(lab_file, encoding='utf-8') as f:
            content_label = f.read().strip()

        sentence_list.append(content)
        label_list.append(sentence2BIOlabel(content, content_label))

    return sentence_list, label_list

def Save_data(filename, texts, tags):
    """Write character/tag pairs to a UTF-8 text file, one pair per line.

    Each line holds one character, a tab, its tag and a newline. Sentences
    are written back to back with no separator line between them.

    Args:
        filename: Path of the file to create or overwrite.
        texts: Sequence of sentences, each an indexable sequence of
            single-character strings.
        tags: Sequence of tag sequences parallel to ``texts``; ``tags[k]``
            must be at least as long as ``texts[k]``.

    Raises:
        IndexError: If a tag sequence is shorter than its sentence.
    """
    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding (the input files are read as UTF-8 as well).
    with open(filename, 'w', encoding='utf-8') as f:
        for sent, tag in zip(texts, tags):
            for i in range(len(sent)):
                f.write(sent[i] + '\t' + tag[i] + '\n')

In [3]:
# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42

start_time = time.time()

# Training set
sentence_list, label_list = loadRawData('./CCKS_2019_Task1/data/')
# Test set
sentence_list_test, label_list_test = loadRawData('./CCKS_2019_Task1/data_test/')


# Split every text into a list of characters, then hold out 20% of the
# training texts for validation.
words = [list(sent) for sent in sentence_list]
t_words = [list(sent) for sent in sentence_list_test]
tags = label_list
t_tags = label_list_test
train_texts, val_texts, train_tags, val_tags = train_test_split(
    words, tags, test_size=.2, random_state=SPLIT_SEED)
test_texts, test_tags = t_words, t_tags

# Save the three splits
Save_data(TRAIN, train_texts, train_tags)
Save_data(VALID, val_texts, val_tags)
Save_data(TEST, test_texts, test_tags)

end_time = time.time()
# Elapsed wall-clock time of this preprocessing cell, in minutes.
training_time = (end_time - start_time)/60
print(f"Total  time: {training_time:.2f} minutes")

Total  time: 0.11 minutes


In [12]:
pip install transformers==3.4

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install pytorch-crf

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install protobuf==3.19.0

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install tensorflow

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/15/db/7f731524fe0e56c6b2eb57d05b55d3badd80ef7d1f1ed59db191b2fdd8ab/protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.0
    Uninstalling protobuf-3.19.0:
      Successfully uninstalled protobuf-3.19.0
Successfully installed protobuf-4.25.3
Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install protobuf==3.19.0

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting protobuf==3.19.0
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/69/ee/949eb6182636fdc4fa0e2fa02a94d79e21069d46b56d4f251d0ac39b5678/protobuf-3.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.3
    Uninstalling protobuf-4.25.3:
      Successfully uninstalled protobuf-4.25.3
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.13.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.19.0 which is incompatible.
tensorboard 2.13.0 requires protobuf>=3.19.6, but you have protobuf 3.19.0 which is incompatible.
Successfully installed protobuf-3.19.0
Note: you may need to restart the kernel to use updated packages.

In [None]:
!python main.py --n_epochs 30