# 使用spaCy工具包解析BioNLP13CG数据集（跳过解析出错的句子）

In [1]:
import os,json
import spacy
from tqdm import tqdm
# Load spaCy's small English pipeline once; the parsing helpers in this notebook read this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Folder whose files are handed to process_data().
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
path_folder='/home/xhsun/Desktop/gitRepositories/ADP2NER/data/BioNLP13CG-IOB/13cg'

In [6]:
def parsing_sentence(sentence_list):
    """Dependency-parse one pre-tokenized sentence with the module-level ``nlp``.

    The words are joined with single spaces and parsed by spaCy. spaCy may split
    an input word into several tokens; when that happens token positions no
    longer correspond to positions in ``sentence_list`` and the sentence is
    reported as a parsing error.

    Args:
        sentence_list: list of word strings forming one sentence.

    Returns:
        A list with one ``[word, pos_tag, head_id, deprel]`` entry per word, where
        ``head_id`` is the 1-based position of the word's syntactic head (0 for a
        root) and ``deprel`` is the lower-cased dependency label. ``None`` is
        returned when spaCy's token count differs from ``len(sentence_list)``.
    """
    doc=nlp(' '.join(sentence_list))
    # spaCy only ever splits words, so equal counts mean a one-to-one alignment.
    # Anything else is a parsing error for the caller to skip.
    if len(doc)!=len(sentence_list):
        return None
    result=[]
    for token in doc:
        deprel=token.dep_.lower()
        # Use the head's position, not a lookup of its text: a text lookup returns
        # the first matching word and is wrong whenever a word occurs twice.
        head_id=0 if deprel=='root' else token.head.i+1
        result.append([token.text,token.pos_,head_id,deprel])
    return result

def process_data(path_folder,write_path):
    """Convert every token/label file in ``path_folder`` into a ``.conllx`` file.

    Each input file must contain one ``word label`` pair per line, with blank
    lines separating sentences. Every sentence is parsed with
    ``parsing_sentence``; sentences for which it returns ``None`` or a result
    whose length differs from the label list are skipped and counted.

    The output file ``<write_path>/<file_name>.conllx`` holds, per token, eleven
    tab-separated columns (id, word, ``_``, POS, POS, ``_``, head id, deprel,
    ``_``, ``_``, entity label) and a blank line after each sentence.

    Args:
        path_folder: directory containing the input files.
        write_path: directory that receives the ``.conllx`` files.
    """
    # Ignore sub-directories and previously generated outputs. write_path may be
    # the same folder as path_folder, and re-reading a ``.conllx`` file would
    # trip the two-column assertion below.
    files_path=[
        name for name in os.listdir(path_folder)
        if os.path.isfile(os.path.join(path_folder,name)) and not name.endswith('.conllx')
    ]
    print(files_path)
    for file_name in files_path:
        with open(os.path.join(path_folder,file_name),encoding='utf-8') as f:
            lines=f.readlines()
        # Start with one open sentence; a blank line opens the next one.
        sentences_and_entlabels=[([],[])]
        for line_no,line in enumerate(lines,start=1):
            line_split=line.split()
            if not line_split:
                sentences_and_entlabels.append(([],[]))
                continue
            assert len(line_split)==2,"%s line %d: expected 'word label', got %r"%(file_name,line_no,line)
            word,entity_label=line_split
            sentences_and_entlabels[-1][0].append(word)
            sentences_and_entlabels[-1][1].append(entity_label)

        # A trailing blank line leaves an empty sentence at the end; drop it.
        if sentences_and_entlabels[-1]==([],[]):
            del sentences_and_entlabels[-1]

        with open(os.path.join(write_path,file_name+".conllx"),'w',encoding='utf-8') as f:
            parsing_error_count=0
            for sentences,entlabels in tqdm(sentences_and_entlabels):
                assert len(sentences)==len(entlabels)

                parsing_result=parsing_sentence(sentence_list=sentences)
                if parsing_result is None or len(parsing_result)!=len(entlabels):
                    # Parsing error: the parser output cannot be aligned with the labels.
                    parsing_error_count+=1
                    continue

                for i,(word,pos_tag,head_id,deprel) in enumerate(parsing_result):
                    ent=entlabels[i]
                    lemma='_'
                    feats='_'
                    conllx_example=[str(i+1),word,lemma,pos_tag,pos_tag,feats,str(head_id),deprel,'_','_',ent]
                    f.write('\t'.join(conllx_example)+'\n')

                f.write('\n')
            print("parsing error count : ",parsing_error_count,len(sentences_and_entlabels))

In [8]:
# Convert every file in path_folder; write_path is the same 13cg folder, so the .conllx outputs land next to the inputs.
process_data(path_folder=path_folder,write_path='/home/xhsun/Desktop/gitRepositories/ADP2NER/data/BioNLP13CG-IOB/13cg')

  1%|▏                                       | 18/3033 [00:00<00:17, 172.25it/s]

['train.txt', 'test.txt']


100%|██████████████████████████████████████| 3033/3033 [00:17<00:00, 168.89it/s]
  1%|▎                                       | 17/1906 [00:00<00:11, 164.09it/s]

parsing error count :  29 3033


100%|██████████████████████████████████████| 1906/1906 [00:11<00:00, 169.82it/s]

parsing error count :  16 1906





In [9]:
{'<PAD>': 0, 'O': 1, 'B-Gene_or_gene_product': 2, 'I-Gene_or_gene_product': 3, 'E-Gene_or_gene_product': 4, 'B-Cancer': 5, 'I-Cancer': 6, 'E-Cancer': 7, 'S-Cancer': 8, 'B-Cell': 9, 'E-Cell': 10, 'S-Gene_or_gene_product': 11, 'S-Cell': 12, 'S-Organism': 13, 'I-Cell': 14, 'S-Simple_chemical': 15, 'B-Simple_chemical': 16, 'I-Simple_chemical': 17, 'E-Simple_chemical': 18, 'S-Multi-tissue_structure': 19, 'B-Multi-tissue_structure': 20, 'E-Multi-tissue_structure': 21, 'S-Organ': 22, 'S-Organism_subdivision': 23, 'B-Tissue': 24, 'I-Tissue': 25, 'E-Tissue': 26, 'S-Tissue': 27, 'S-Immaterial_anatomical_entity': 28, 'S-Organism_substance': 29, 'B-Organism_substance': 30, 'I-Organism_substance': 31, 'E-Organism_substance': 32, 'I-Multi-tissue_structure': 33, 'B-Organism': 34, 'I-Organism': 35, 'E-Organism': 36, 'B-Organism_subdivision': 37, 'E-Organism_subdivision': 38, 'S-Cellular_component': 39, 'B-Immaterial_anatomical_entity': 40, 'I-Immaterial_anatomical_entity': 41, 'E-Immaterial_anatomical_entity': 42, 'B-Cellular_component': 43, 'E-Cellular_component': 44, 'S-Pathological_formation': 45, 'I-Cellular_component': 46, 'B-Pathological_formation': 47, 'I-Pathological_formation': 48, 'E-Pathological_formation': 49, 'B-Organ': 50, 'E-Organ': 51, 'B-Amino_acid': 52, 'I-Amino_acid': 53, 'E-Amino_acid': 54, 'S-Amino_acid': 55, 'B-Anatomical_system': 56, 'E-Anatomical_system': 57, 'S-Anatomical_system': 58, 'I-Anatomical_system': 59, 'S-Developing_anatomical_structure': 60, 'B-Developing_anatomical_structure': 61, 'E-Developing_anatomical_structure': 62, '<START>': 63, '<STOP>': 64}

{'<PAD>': 0,
 'O': 1,
 'B-Gene_or_gene_product': 2,
 'I-Gene_or_gene_product': 3,
 'E-Gene_or_gene_product': 4,
 'B-Cancer': 5,
 'I-Cancer': 6,
 'E-Cancer': 7,
 'S-Cancer': 8,
 'B-Cell': 9,
 'E-Cell': 10,
 'S-Gene_or_gene_product': 11,
 'S-Cell': 12,
 'S-Organism': 13,
 'I-Cell': 14,
 'S-Simple_chemical': 15,
 'B-Simple_chemical': 16,
 'I-Simple_chemical': 17,
 'E-Simple_chemical': 18,
 'S-Multi-tissue_structure': 19,
 'B-Multi-tissue_structure': 20,
 'E-Multi-tissue_structure': 21,
 'S-Organ': 22,
 'S-Organism_subdivision': 23,
 'B-Tissue': 24,
 'I-Tissue': 25,
 'E-Tissue': 26,
 'S-Tissue': 27,
 'S-Immaterial_anatomical_entity': 28,
 'S-Organism_substance': 29,
 'B-Organism_substance': 30,
 'I-Organism_substance': 31,
 'E-Organism_substance': 32,
 'I-Multi-tissue_structure': 33,
 'B-Organism': 34,
 'I-Organism': 35,
 'E-Organism': 36,
 'B-Organism_subdivision': 37,
 'E-Organism_subdivision': 38,
 'S-Cellular_component': 39,
 'B-Immaterial_anatomical_entity': 40,
 'I-Immaterial_anatomi

# 使用spaCy解析W-NUT17数据集

In [3]:
def get_all_sentences(file_path):
    """Read a token-per-line NER file into a list of ``(words, labels)`` sentences.

    Every non-blank line must contain exactly two whitespace-separated fields,
    ``word label``; blank lines separate sentences. The average sentence length
    is printed as a side effect.

    Args:
        file_path: path of the file to read.

    Returns:
        A list of ``(words, labels)`` tuples, each holding two parallel lists of
        strings. An empty file yields an empty list.
    """
    with open(file_path,encoding='utf-8') as f:
        lines=f.readlines()
    # Start with one open sentence; a blank line opens the next one.
    sentences_and_entlabels=[([],[])]
    for line_no,line in enumerate(lines,start=1):
        line_split=line.split()
        if not line_split:
            sentences_and_entlabels.append(([],[]))
            continue
        assert len(line_split)==2,"%s line %d: expected 'word label', got %r"%(file_path,line_no,line)
        word,entity_label=line_split
        sentences_and_entlabels[-1][0].append(word)
        sentences_and_entlabels[-1][1].append(entity_label)

    # A trailing blank line leaves an empty sentence at the end; drop it.
    if sentences_and_entlabels[-1]==([],[]):
        del sentences_and_entlabels[-1]
    sen_lengths=sum(len(words) for words,_labels in sentences_and_entlabels)
    # An empty file leaves no sentences at all; report 0.0 instead of dividing by zero.
    average=sen_lengths/len(sentences_and_entlabels) if sentences_and_entlabels else 0.0
    print("平均长度 : ",average)
    return sentences_and_entlabels

In [20]:
# Load the W-NUT17 train and test splits from their *_prepro_url.txt files.
train_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/W-NUT17/train_prepro_url.txt')
test_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/W-NUT17/test_prepro_url.txt')

平均长度 :  18.482616381850324
平均长度 :  18.177156177156178


In [21]:
# Sentence counts of the train and test splits, one per line.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),sep='\n')

3394
1287


In [22]:
# Inspect the first training example: a (words, labels) tuple of two parallel lists.
print(train_sentences_and_entlabels[0])

(['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])


In [23]:
# Parse the first training sentence (its words re-joined with single spaces) and display the resulting Doc.
doc=nlp(' '.join(train_sentences_and_entlabels[0][0]))
doc

@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .

In [31]:
# Compare spaCy's token count with the number of words ([0][0]) and of labels ([0][1]) in the first sentence.
print(len(doc),len(train_sentences_and_entlabels[0][0]),len(train_sentences_and_entlabels[0][1]))

28 27 27


**spaCy会对已经分好的词再次切分（例如 `'m` 被拆成 `'` 和 `m`），导致token数量（28）与标签数量（27）不一致，token的id与标签无法一一对应**

In [30]:
# Print text, head text, lower-cased dependency label and coarse POS tag for every token of `doc`.
for token in doc:
    word,deprel,head,pos_tag=token.text,token.dep_.lower(),token.head.text,token.pos_
    print(word,head,deprel,pos_tag,sep=' \t ')

@paulwalk 	 @paulwalk 	 root 	 PUNCT
It 	 's 	 nsubj 	 PRON
's 	 's 	 root 	 AUX
the 	 view 	 det 	 DET
view 	 's 	 attr 	 NOUN
from 	 view 	 prep 	 ADP
where 	 living 	 advmod 	 ADV
I 	 m 	 poss 	 PRON
' 	 I 	 case 	 PUNCT
m 	 living 	 nsubj 	 X
living 	 from 	 pcomp 	 VERB
for 	 living 	 prep 	 ADP
two 	 weeks 	 nummod 	 NUM
weeks 	 for 	 pobj 	 NOUN
. 	 's 	 punct 	 PUNCT
Empire 	 ESB 	 compound 	 PROPN
State 	 ESB 	 compound 	 PROPN
Building 	 ESB 	 nmod 	 PROPN
= 	 ESB 	 punct 	 PUNCT
ESB 	 ESB 	 root 	 PROPN
. 	 ESB 	 punct 	 PUNCT
Pretty 	 bad 	 advmod 	 ADV
bad 	 storm 	 amod 	 ADJ
storm 	 storm 	 root 	 NOUN
here 	 storm 	 advmod 	 ADV
last 	 evening 	 amod 	 ADJ
evening 	 storm 	 npadvmod 	 NOUN
. 	 storm 	 punct 	 PUNCT


In [7]:
def parsing_sentence(sentence_list,ent_list):
    """Dependency-parse a pre-tokenized sentence and label every spaCy token.

    The words are joined with single spaces and parsed by the module-level
    ``nlp``. spaCy may split one input word into several tokens (for example
    ``'m`` into ``'`` and ``m``), so the result can have more rows than
    ``sentence_list``. Each token is mapped back to the word it was cut from via
    its character offset and receives that word's entity label, so labels after
    a split word are not shifted.

    Args:
        sentence_list: list of word strings forming one sentence.
        ent_list: entity labels, parallel to ``sentence_list``.

    Returns:
        One ``[word, pos_tag, head_id, deprel, ent]`` list per spaCy token.
        ``head_id`` is the 1-based position of the head token within the returned
        rows (0 for a root); ``deprel`` is the lower-cased dependency label.

    Note:
        All pieces of a split word carry the same label, so a split ``B-``
        word yields consecutive ``B-`` rows.
    """
    sentence=' '.join(sentence_list)
    doc=nlp(sentence)
    # Character offset at which each input word starts inside `sentence`.
    word_starts=[]
    offset=0
    for source_word in sentence_list:
        word_starts.append(offset)
        offset+=len(source_word)+1  # +1 for the joining space
    result=[]
    word_idx=0
    for token in doc:
        # Tokens arrive in text order, so the source word only ever moves forward.
        while word_idx+1<len(word_starts) and word_starts[word_idx+1]<=token.idx:
            word_idx+=1
        deprel=token.dep_.lower()
        # Use the head token's own position: a lookup by text finds the first word
        # with that text and breaks on repeated words and on split words.
        head_id=0 if deprel=='root' else token.head.i+1
        # Fall back to the last label if ent_list is shorter than sentence_list.
        ent=ent_list[min(word_idx,len(ent_list)-1)]
        result.append([token.text,token.pos_,head_id,deprel,ent])
    return result

In [8]:
def write_parsing_results(examples,write_path):
    """Parse every example and write the rows to ``write_path`` in an 11-column tab-separated layout.

    Args:
        examples: iterable of ``(sentence_list, ent_list)`` pairs.
        write_path: path of the output file (opened for writing, so overwritten).

    Each row returned by ``parsing_sentence`` becomes one line with the columns
    id, word, ``_``, POS, POS, ``_``, head id, deprel, ``_``, ``_``, entity label;
    a blank line follows every sentence. Prints ``write over!`` when done.
    """
    blank='_'
    with open(write_path,'w') as out_file:
        for sentence_list,ent_list in tqdm(examples):
            rows=parsing_sentence(sentence_list,ent_list)
            # Row ids are 1-based positions within the parsed sentence.
            for row_id,(word,pos_tag,head_id,deprel,ent) in enumerate(rows,start=1):
                columns=[str(row_id),word,blank,pos_tag,pos_tag,blank,str(head_id),deprel,blank,blank,ent]
                out_file.write('\t'.join(columns)+'\n')
            out_file.write('\n')
    print("write over!")


In [None]:

# Report the split sizes, then parse both splits and write them as W-NUT17 .conllx files.
# NOTE(review): the output directories must already exist — the writer opens the path with open(..., 'w'), which does not create them.
print(len(train_sentences_and_entlabels))
print(len(test_sentences_and_entlabels))
write_parsing_results(examples=train_sentences_and_entlabels,write_path='/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/W-NUT17/Spacy/train.conllx')
write_parsing_results(examples=test_sentences_and_entlabels,write_path='/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/W-NUT17/Spacy/test.conllx')

# 使用Spacy工具包解析NCBI数据集

In [4]:
# Load the NCBI train and test splits; this rebinds the same names used for the W-NUT17 data earlier in the notebook.
train_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/NCBI/train.txt')
test_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/NCBI/test.txt')

平均长度 :  25.0186209439528
平均长度 :  26.06063829787234


In [5]:
# Sentence counts of the train and test splits, one per line.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),sep='\n')

5424
940


In [6]:
# Inspect the first training example: a (words, labels) tuple of two parallel lists.
print(train_sentences_and_entlabels[0])

(['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'O'])


In [7]:
# Parse the first training sentence (its words re-joined with single spaces) and display the resulting Doc.
doc=nlp(' '.join(train_sentences_and_entlabels[0][0]))
doc

Identification of APC2 , a homologue of the adenomatous polyposis coli tumour suppressor .

In [8]:
# Compare spaCy's token count with the number of words ([0][0]) and of labels ([0][1]) in the first sentence.
print(len(doc),len(train_sentences_and_entlabels[0][0]),len(train_sentences_and_entlabels[0][1]))

14 14 14


In [11]:

# Report the split sizes, then parse both NCBI splits and write them as .conllx files.
print(len(train_sentences_and_entlabels))
print(len(test_sentences_and_entlabels))
write_parsing_results(examples=train_sentences_and_entlabels,write_path='/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/NCBI/Spacy/train.conllx')
write_parsing_results(examples=test_sentences_and_entlabels,write_path='/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/NCBI/Spacy/test.conllx')

  0%|▍                                                                                                                                                                  | 14/5424 [00:00<00:39, 136.08it/s]

5424
940


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5424/5424 [00:28<00:00, 189.11it/s]
  2%|███▏                                                                                                                                                                | 18/940 [00:00<00:05, 170.29it/s]

write over!


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 940/940 [00:05<00:00, 186.60it/s]

write over!





# 使用Spacy工具包解析13CG数据集

In [35]:
# Load the BioNLP13CG train, test and dev splits; the train/test names are rebound once more.
train_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/BioNLP13CG-IOB/train.txt')
test_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/BioNLP13CG-IOB/test.txt')
dev_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/BioNLP13CG-IOB/dev.txt')

平均长度 :  27.519617540389053
平均长度 :  27.686778593913957
平均长度 :  27.516450648055834


In [36]:
# Sentence counts of the train, test and dev splits, one per line.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),len(dev_sentences_and_entlabels),sep='\n')

3033
1906
1003


In [38]:

# Report the split sizes, then parse the train and test splits and write them as .conllx files.
# NOTE(review): dev_sentences_and_entlabels is loaded earlier in this section, but no dev.conllx is written here — confirm that is intended.
print(len(train_sentences_and_entlabels))
print(len(test_sentences_and_entlabels))
write_parsing_results(examples=train_sentences_and_entlabels,write_path='/home/xhsun/Downloads/MTL-Bioinformatics-2016-master/data/BioNLP13CG-IOBES/train.conllx')
write_parsing_results(examples=test_sentences_and_entlabels,write_path='/home/xhsun/Downloads/MTL-Bioinformatics-2016-master/data/BioNLP13CG-IOBES/test.conllx')

  1%|▎                                       | 21/3033 [00:00<00:14, 202.25it/s]

3033
1906


100%|██████████████████████████████████████| 3033/3033 [00:14<00:00, 211.70it/s]
  1%|▍                                       | 22/1906 [00:00<00:08, 219.22it/s]

write over!


100%|██████████████████████████████████████| 1906/1906 [00:09<00:00, 207.24it/s]

write over!





In [13]:
# Give every entity label a consecutive id, in order of first appearance in the training split.
label2id={}
for _words,label_list in train_sentences_and_entlabels:
    for tag in label_list:
        # setdefault only inserts unseen tags; len() is taken before the insert.
        label2id.setdefault(tag,len(label2id))

In [14]:
# Display the label-to-id mapping.
label2id

{'O': 0,
 'B-Gene_or_gene_product': 1,
 'I-Gene_or_gene_product': 2,
 'B-Cancer': 3,
 'I-Cancer': 4,
 'B-Cell': 5,
 'I-Cell': 6,
 'B-Organism': 7,
 'B-Simple_chemical': 8,
 'I-Simple_chemical': 9,
 'B-Multi-tissue_structure': 10,
 'I-Multi-tissue_structure': 11,
 'B-Organ': 12,
 'B-Organism_subdivision': 13,
 'B-Tissue': 14,
 'I-Tissue': 15,
 'B-Immaterial_anatomical_entity': 16,
 'B-Organism_substance': 17,
 'I-Organism_substance': 18,
 'I-Organism': 19,
 'I-Organism_subdivision': 20,
 'B-Cellular_component': 21,
 'I-Immaterial_anatomical_entity': 22,
 'I-Cellular_component': 23,
 'B-Pathological_formation': 24,
 'I-Pathological_formation': 25,
 'I-Organ': 26,
 'B-Amino_acid': 27,
 'I-Amino_acid': 28,
 'B-Anatomical_system': 29,
 'I-Anatomical_system': 30,
 'B-Developing_anatomical_structure': 31,
 'I-Developing_anatomical_structure': 32}

In [17]:
{'<PAD>': 0, 'O': 1, 'B-Gene_or_gene_product': 2, 'I-Gene_or_gene_product': 3, 'E-Gene_or_gene_product': 4, 'B-Cancer': 5, 'I-Cancer': 6, 'E-Cancer': 7, 'S-Cancer': 8, 'B-Cell': 9, 'E-Cell': 10, 'S-Gene_or_gene_product': 11, 'S-Cell': 12, 'S-Organism': 13, 'I-Cell': 14, 'S-Simple_chemical': 15, 'B-Simple_chemical': 16, 'I-Simple_chemical': 17, 'E-Simple_chemical': 18, 'S-Multi-tissue_structure': 19, 'B-Multi-tissue_structure': 20, 'E-Multi-tissue_structure': 21, 'S-Organ': 22, 'S-Organism_subdivision': 23, 'B-Tissue': 24, 'I-Tissue': 25, 'E-Tissue': 26, 'S-Tissue': 27, 'S-Immaterial_anatomical_entity': 28, 'S-Organism_substance': 29, 'B-Organism_substance': 30, 'I-Organism_substance': 31, 'E-Organism_substance': 32, 'I-Multi-tissue_structure': 33, 'B-Organism': 34, 'I-Organism': 35, 'E-Organism': 36, 'B-Organism_subdivision': 37, 'E-Organism_subdivision': 38, 'S-Cellular_component': 39, 'B-Immaterial_anatomical_entity': 40, 'I-Immaterial_anatomical_entity': 41, 'E-Immaterial_anatomical_entity': 42, 'B-Cellular_component': 43, 'E-Cellular_component': 44, 'S-Pathological_formation': 45, 'I-Cellular_component': 46, 'B-Pathological_formation': 47, 'I-Pathological_formation': 48, 'E-Pathological_formation': 49, 'B-Organ': 50, 'E-Organ': 51, 'B-Amino_acid': 52, 'I-Amino_acid': 53, 'E-Amino_acid': 54, 'S-Amino_acid': 55, 'B-Anatomical_system': 56, 'E-Anatomical_system': 57, 'S-Anatomical_system': 58, 'I-Anatomical_system': 59, 'S-Developing_anatomical_structure': 60, 'B-Developing_anatomical_structure': 61, 'E-Developing_anatomical_structure': 62, '<START>': 63, '<STOP>': 64}

{'<PAD>': 0,
 'O': 1,
 'B-Gene_or_gene_product': 2,
 'I-Gene_or_gene_product': 3,
 'E-Gene_or_gene_product': 4,
 'B-Cancer': 5,
 'I-Cancer': 6,
 'E-Cancer': 7,
 'S-Cancer': 8,
 'B-Cell': 9,
 'E-Cell': 10,
 'S-Gene_or_gene_product': 11,
 'S-Cell': 12,
 'S-Organism': 13,
 'I-Cell': 14,
 'S-Simple_chemical': 15,
 'B-Simple_chemical': 16,
 'I-Simple_chemical': 17,
 'E-Simple_chemical': 18,
 'S-Multi-tissue_structure': 19,
 'B-Multi-tissue_structure': 20,
 'E-Multi-tissue_structure': 21,
 'S-Organ': 22,
 'S-Organism_subdivision': 23,
 'B-Tissue': 24,
 'I-Tissue': 25,
 'E-Tissue': 26,
 'S-Tissue': 27,
 'S-Immaterial_anatomical_entity': 28,
 'S-Organism_substance': 29,
 'B-Organism_substance': 30,
 'I-Organism_substance': 31,
 'E-Organism_substance': 32,
 'I-Multi-tissue_structure': 33,
 'B-Organism': 34,
 'I-Organism': 35,
 'E-Organism': 36,
 'B-Organism_subdivision': 37,
 'E-Organism_subdivision': 38,
 'S-Cellular_component': 39,
 'B-Immaterial_anatomical_entity': 40,
 'I-Immaterial_anatomi