# 使用Spacy工具包解析W-NUT17数据集

In [1]:
import os,json
import spacy
from tqdm import tqdm
nlp = spacy.load("en_core_web_sm")

In [2]:
# Directory that holds the W-NUT17 data files read by this notebook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_folder='/home/xhsun/Desktop/gitRepositories/ADP2NER/data/W-NUT17/'

# 使用Spacy解析

In [3]:
def get_all_sentences(file_path):
    """Read a two-column, blank-line-separated tagging file into sentences.

    Every non-blank line must contain exactly two whitespace-separated
    fields: the word and its entity label. Blank lines separate sentences.

    Args:
        file_path: path of the text file to read (decoded as UTF-8).

    Returns:
        A list of ``(words, labels)`` tuples, one per sentence; both members
        are lists of strings of equal length. Leading, trailing, or repeated
        blank lines never produce empty sentences.

    Raises:
        AssertionError: if a non-blank line does not have exactly two fields.

    Side effects:
        Prints the average sentence length (0.0 when the file has no
        sentences, instead of dividing by zero).
    """
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(file_path,encoding='utf-8') as f:
        lines=f.readlines()

    sentences_and_entlabels=[]
    words,labels=[],[]
    for line_no,line in enumerate(lines,start=1):
        line_split=line.split()
        if not line_split:
            # Sentence boundary; only flush when something was collected so
            # that consecutive blank lines do not create empty examples.
            if words:
                sentences_and_entlabels.append((words,labels))
                words,labels=[],[]
            continue
        assert len(line_split)==2,"line %d: expected 'word label', got %r"%(line_no,line)
        word,entity_label=line_split
        words.append(word)
        labels.append(entity_label)

    # The file may end without a trailing blank line.
    if words:
        sentences_and_entlabels.append((words,labels))

    sen_lengths=sum(len(example[0]) for example in sentences_and_entlabels)
    if sentences_and_entlabels:
        average_length=sen_lengths/len(sentences_and_entlabels)
    else:
        average_length=0.0
    print("平均长度 : ",average_length)
    return sentences_and_entlabels

In [20]:
# Load the W-NUT17 train/test splits from the configured data folder instead of
# repeating the absolute directory in every call.
train_sentences_and_entlabels=get_all_sentences(os.path.join(path_folder,'train_prepro_url.txt'))
test_sentences_and_entlabels=get_all_sentences(os.path.join(path_folder,'test_prepro_url.txt'))

平均长度 :  18.482616381850324
平均长度 :  18.177156177156178


In [21]:
# Number of sentences per split: train on the first line, test on the second.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),sep='\n')

3394
1287


In [22]:
# Show the first training example: a (words, labels) tuple.
print(train_sentences_and_entlabels[0])

(['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])


In [23]:
# Parse the first training sentence after joining its words with single spaces.
# spaCy tokenizes the joined string itself, so the resulting tokens are not
# guaranteed to match the original word list one-to-one.
doc=nlp(' '.join(train_sentences_and_entlabels[0][0]))
doc

@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .

In [31]:
# Compare spaCy's token count with the number of original words and of labels
# (the label count used to be printed twice and the word count not at all).
print(len(doc),len(train_sentences_and_entlabels[0][0]),len(train_sentences_and_entlabels[0][1]))

28 27 27


**注意：spaCy 会对拼接后的句子重新分词（例如下方输出中 `'m` 被拆成 `'` 和 `m`），因此 token 数（28）与原始词数（27）不一致，token 的 id 无法与实体标签逐一对应。**

In [30]:
# One row per spaCy token: text, head text, lower-cased dependency label, POS tag.
for token in doc:
    word,head=token.text,token.head.text
    deprel,pos_tag=token.dep_.lower(),token.pos_
    # ' \t ' reproduces the space-tab-space gap between the columns.
    print(word,head,deprel,pos_tag,sep=' \t ')

@paulwalk 	 @paulwalk 	 root 	 PUNCT
It 	 's 	 nsubj 	 PRON
's 	 's 	 root 	 AUX
the 	 view 	 det 	 DET
view 	 's 	 attr 	 NOUN
from 	 view 	 prep 	 ADP
where 	 living 	 advmod 	 ADV
I 	 m 	 poss 	 PRON
' 	 I 	 case 	 PUNCT
m 	 living 	 nsubj 	 X
living 	 from 	 pcomp 	 VERB
for 	 living 	 prep 	 ADP
two 	 weeks 	 nummod 	 NUM
weeks 	 for 	 pobj 	 NOUN
. 	 's 	 punct 	 PUNCT
Empire 	 ESB 	 compound 	 PROPN
State 	 ESB 	 compound 	 PROPN
Building 	 ESB 	 nmod 	 PROPN
= 	 ESB 	 punct 	 PUNCT
ESB 	 ESB 	 root 	 PROPN
. 	 ESB 	 punct 	 PUNCT
Pretty 	 bad 	 advmod 	 ADV
bad 	 storm 	 amod 	 ADJ
storm 	 storm 	 root 	 NOUN
here 	 storm 	 advmod 	 ADV
last 	 evening 	 amod 	 ADJ
evening 	 storm 	 npadvmod 	 NOUN
. 	 storm 	 punct 	 PUNCT


In [9]:
def parsing_sentence(sentence_list,ent_list):
    """Dependency-parse a pre-tokenized sentence and attach its entity labels.

    The sentence is handed to spaCy as an already tokenized ``Doc`` so that
    spaCy cannot split or merge words. The parse therefore has exactly one
    token per input word, and every row stays aligned with ``ent_list``.
    Head positions are taken from the parse itself (``token.head.i``) rather
    than looked up by word text, which would pick the first occurrence of a
    repeated word.

    Args:
        sentence_list: list of word strings of one sentence.
        ent_list: list of entity labels, one per word.

    Returns:
        A list with one ``[word, pos_tag, head_id, deprel, ent]`` entry per
        word. ``deprel`` is the lower-cased dependency label; ``head_id`` is
        0 for a root token and the 1-based position of the head otherwise.

    Raises:
        ValueError: if ``sentence_list`` and ``ent_list`` differ in length.
    """
    if len(sentence_list)!=len(ent_list):
        raise ValueError(
            "sentence_list and ent_list must have the same length, got %d and %d"
            %(len(sentence_list),len(ent_list)))
    if not sentence_list:
        return []

    # Build the Doc from the given words and run the pipeline components on
    # it directly, bypassing nlp's own tokenizer.
    doc=spacy.tokens.Doc(nlp.vocab,words=list(sentence_list))
    for _name,component in nlp.pipeline:
        doc=component(doc)

    result=[]
    for token,ent in zip(doc,ent_list):
        deprel=token.dep_.lower()
        # Root tokens get head 0; all other heads are 1-based positions.
        head_id=0 if deprel=='root' else token.head.i+1
        result.append([token.text,token.pos_,head_id,deprel,ent])
    return result

In [10]:
def write_parsing_results(examples,write_path):
    """Parse every example and write the rows to ``write_path``.

    Each token becomes one tab-separated line with 11 columns: 1-based token
    index, word, '_' (lemma placeholder), POS tag, POS tag again, '_'
    (features placeholder), head index, dependency relation, '_', '_', and
    the entity label. An empty line is written after every sentence.

    Args:
        examples: iterable of ``(sentence_list, ent_list)`` pairs.
        write_path: destination file; it is overwritten.

    Side effects:
        Shows a tqdm progress bar and prints "write over!" when finished.
    """
    # Explicit UTF-8 so non-ASCII tokens do not depend on the machine's locale.
    with open(write_path,'w',encoding='utf-8') as f:
        for sentence_list,ent_list in tqdm(examples):
            parsing_result=parsing_sentence(sentence_list,ent_list)
            for token_id,(word,pos_tag,head_id,deprel,ent) in enumerate(parsing_result,start=1):
                conllx_example=[str(token_id),word,'_',pos_tag,pos_tag,'_',str(head_id),deprel,'_','_',ent]
                f.write('\t'.join(conllx_example)+'\n')
            f.write('\n')
    print("write over!")


In [None]:

# Report the split sizes (train, then test) and export both parsed splits.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),sep='\n')
write_parsing_results(train_sentences_and_entlabels,'/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/W-NUT17/Spacy/train.conllx')
write_parsing_results(test_sentences_and_entlabels,'/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/W-NUT17/Spacy/test.conllx')

# 使用Spacy工具包解析NCBI数据集

In [4]:
# Load the NCBI train/test splits.
# NOTE(review): this rebinds the variable names used for W-NUT17 earlier, so
# every later cell operates on whichever dataset was loaded last.
train_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/NCBI/train.txt')
test_sentences_and_entlabels=get_all_sentences('/home/xhsun/Desktop/gitRepositories/ADP2NER/data/NCBI/test.txt')

平均长度 :  25.0186209439528
平均长度 :  26.06063829787234


In [5]:
# Number of sentences per split: train on the first line, test on the second.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),sep='\n')

5424
940


In [6]:
# Show the first training example: a (words, labels) tuple.
print(train_sentences_and_entlabels[0])

(['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'O'])


In [7]:
# Parse the first training sentence after joining its words with single spaces;
# spaCy tokenizes the joined string itself.
doc=nlp(' '.join(train_sentences_and_entlabels[0][0]))
doc

Identification of APC2 , a homologue of the adenomatous polyposis coli tumour suppressor .

In [8]:
# Compare spaCy's token count with the number of original words and of labels
# (the label count used to be printed twice and the word count not at all).
print(len(doc),len(train_sentences_and_entlabels[0][0]),len(train_sentences_and_entlabels[0][1]))

14 14 14


In [11]:

# Report the split sizes (train, then test) and export both parsed splits.
print(len(train_sentences_and_entlabels),len(test_sentences_and_entlabels),sep='\n')
write_parsing_results(train_sentences_and_entlabels,'/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/NCBI/Spacy/train.conllx')
write_parsing_results(test_sentences_and_entlabels,'/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/NCBI/Spacy/test.conllx')

  0%|▍                                                                                                                                                                  | 14/5424 [00:00<00:39, 136.08it/s]

5424
940


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5424/5424 [00:28<00:00, 189.11it/s]
  2%|███▏                                                                                                                                                                | 18/940 [00:00<00:05, 170.29it/s]

write over!


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 940/940 [00:05<00:00, 186.60it/s]

write over!





In [None]:

def process_data(file_path,write_path):
    """Read a two-column tagging file, parse each sentence, and write the rows.

    Input format: one ``word label`` pair per line, sentences separated by
    blank lines. Output format: one tab-separated 11-column line per token
    (1-based index, word, '_', POS tag, POS tag, '_', head index, dependency
    relation, '_', '_', entity label) and an empty line after each sentence.

    Sentences whose parse is ``None`` or whose number of rows differs from the
    number of labels are skipped and counted as parsing errors.

    Args:
        file_path: path of the input file (decoded as UTF-8).
        write_path: path of the output file (overwritten, written as UTF-8).

    Raises:
        AssertionError: if a non-blank input line does not have two fields.

    Side effects:
        Shows a tqdm progress bar and prints the number of skipped sentences
        together with the total number of sentences read.
    """
    with open(file_path,encoding='utf-8') as f:
        lines=f.readlines()

    sentences_and_entlabels=[([],[])]
    for line in lines:
        line_split=line.split()
        if not line_split:
            # Blank line: start collecting the next sentence.
            sentences_and_entlabels.append(([],[]))
            continue
        assert len(line_split)==2
        word,entity_label=line_split
        sentences_and_entlabels[-1][0].append(word)
        sentences_and_entlabels[-1][1].append(entity_label)
    # Drop empty sentences produced by leading, trailing, or repeated blank lines.
    sentences_and_entlabels=[example for example in sentences_and_entlabels if example[0]]

    parsing_error_count=0
    with open(write_path,'w',encoding='utf-8') as f:
        for sentences,entlabels in tqdm(sentences_and_entlabels):
            assert len(sentences)==len(entlabels)

            # Positional call: parsing_sentence's second parameter is named
            # ent_list, so the former entlabels= keyword raised TypeError.
            parsing_result=parsing_sentence(sentences,entlabels)
            if parsing_result is None or len(parsing_result)!=len(entlabels):
                # The parse cannot be aligned with the labels; skip the sentence.
                parsing_error_count+=1
                continue

            for i,row in enumerate(parsing_result):
                # Rows may carry the entity label as a fifth field; only the
                # first four fields are needed, the label comes from entlabels.
                word,pos_tag,head_id,deprel=row[:4]
                ent=entlabels[i]
                conllx_example=[str(i+1),word,'_',pos_tag,pos_tag,'_',str(head_id),deprel,'_','_',ent]
                f.write('\t'.join(conllx_example)+'\n')

            f.write('\n')
    print("parsing error count : ",parsing_error_count,len(sentences_and_entlabels))