In [1]:
import os,json
import spacy
from tqdm import tqdm
# Load spaCy's small English pipeline. NOTE: `nlp` is rebound to a Stanza
# pipeline later in this notebook, so which parser a cell uses depends on
# execution order.
nlp = spacy.load("en_core_web_sm")

# CoNLL03

In [21]:
# Input directory handed to process_data; every entry in it is opened as a data file.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_folder='/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/CoNLL03/en_conll03/'

# 使用 spaCy 解析

In [5]:
def parsing_sentence(sentence_list):
    """Dependency-parse one pre-tokenized sentence with the module-level ``nlp``.

    The words are joined with single spaces and parsed as one string. ``nlp``
    must return a spaCy-style document here (tokens exposing ``text``,
    ``pos_``, ``dep_``, ``head`` and ``i``).

    Args:
        sentence_list: list of word strings forming one sentence.

    Returns:
        A list with one ``[word, pos_tag, head_id, deprel]`` entry per token.
        ``head_id`` is the 1-based position of the token's head (0 for a root
        token) and ``deprel`` is the lower-cased dependency label.
        ``None`` when the parser's tokens do not line up one-to-one with
        ``sentence_list`` (for example when it splits a word such as
        "1996-08-22" into several tokens), because head positions would then
        not refer to the input words.
    """
    sentence=' '.join(sentence_list)
    doc=nlp(sentence)
    # Parsing is prone to tokenization mismatches; detect them explicitly
    # instead of relying on a failed text lookup inside a bare except.
    if [token.text for token in doc]!=list(sentence_list):
        return None
    result=[]
    for token in doc:
        deprel=token.dep_.lower()
        if deprel=='root':
            head_id=0
        else:
            # Use the head's own position. Looking the head up by its text
            # (list.index) returns the first occurrence, which is wrong
            # whenever the same word appears more than once in the sentence.
            head_id=token.head.i+1
        result.append([token.text,token.pos_,head_id,deprel])
    return result

def process_data(path_folder,write_path):
    """Parse every file in ``path_folder`` and write one ``.conllx`` file per input.

    Each input file is read as whitespace-separated ``word label`` lines, with
    blank lines separating sentences. Every sentence is passed to
    ``parsing_sentence``; sentences for which it returns ``None`` or a result
    whose length differs from the label count are skipped and counted as
    parsing errors. For the remaining sentences one tab-separated line per
    token is written with the columns
    ``id, word, '_', pos_tag, pos_tag, '_', head_id, deprel, '_', '_', label``,
    followed by a blank line.

    Args:
        path_folder: directory whose entries are all opened as input files.
        write_path: directory that receives ``<input name>.conllx``; it is
            created if it does not exist.

    Raises:
        AssertionError: if a non-blank input line does not consist of exactly
            two whitespace-separated fields.
    """
    files_path=os.listdir(path_folder)
    print(files_path)
    # Create the output directory up front; an empty write_path means the
    # current directory and needs no creation.
    if write_path:
        os.makedirs(write_path,exist_ok=True)
    for file_name in files_path:
        with open(os.path.join(path_folder,file_name)) as f:
            lines=f.readlines()
        sentences_and_entlabels=[([],[])]
        for line in lines:
            line_split=line.split()
            if not line_split:
                # A blank line closes the current sentence. Only open a new one
                # if the current one has content, so leading or consecutive
                # blank lines do not produce empty sentences in the output.
                if sentences_and_entlabels[-1][0]:
                    sentences_and_entlabels.append(([],[]))
                continue
            assert len(line_split)==2,"expected 'word label', got %r"%line
            word,entity_label=line_split
            sentences_and_entlabels[-1][0].append(word)
            sentences_and_entlabels[-1][1].append(entity_label)

        # Drop the still-empty sentence left open by a trailing blank line
        # (or by an empty file).
        if not sentences_and_entlabels[-1][0]:
            del sentences_and_entlabels[-1]

        with open(os.path.join(write_path,file_name+".conllx"),'w') as f:
            parsing_error_count=0
            for sentences,entlabels in tqdm(sentences_and_entlabels):
                assert len(sentences)==len(entlabels)

                parsing_result=parsing_sentence(sentence_list=sentences)
                if parsing_result is None or len(parsing_result)!=len(entlabels):
                    # The parse could not be aligned with the labels.
                    parsing_error_count+=1
                    continue

                rows=zip(parsing_result,entlabels)
                for i,((word,pos_tag,head_id,deprel),ent) in enumerate(rows,start=1):
                    # Lemma and feature columns are not produced; fill with '_'.
                    conllx_example=[str(i),word,'_',pos_tag,pos_tag,'_',str(head_id),deprel,'_','_',ent]
                    f.write('\t'.join(conllx_example)+'\n')

                f.write('\n')
            print("parsing error count : ",parsing_error_count,len(sentences_and_entlabels))

In [6]:
# Convert all files under path_folder; the .conllx outputs are written into a
# different repository's data folder (absolute, machine-specific path).
# NOTE(review): intended to run with the spaCy `nlp`/parsing_sentence defined above - confirm
# the kernel state, since both names are redefined further down.
process_data(path_folder=path_folder,write_path='/home/xhsun/Desktop/gitRepositories/DP-Relatedwork-BrieflySummarize/data/CoNLL03/')

['train.word.bmes', 'test.word.bmes', 'dev.word.bmes']


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 14041/14041 [00:53<00:00, 260.64it/s]


parsing error count :  4026 14041


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3453/3453 [00:13<00:00, 256.86it/s]


parsing error count :  979 3453


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3250/3250 [00:13<00:00, 249.72it/s]

parsing error count :  1030 3250





In [32]:
# Parse a short two-word example containing a hyphenated date to inspect the tokenization.
doc=nlp("Brussels 1996-08-22")

In [36]:
# Print every token with its head and dependency label.
for token in doc:
    print(token,token.head,token.dep_)

Brussels Brussels ROOT
1996 Brussels nummod
- Brussels punct
08 22 nummod
- 22 punct
22 Brussels appos


# 使用Stanza解析

In [4]:
import stanza
# Rebinds `nlp` to a Stanza English pipeline (tokenize, mwt, pos, lemma, depparse).
# NOTE: this shadows the spaCy pipeline assigned to `nlp` at the top of the notebook.
nlp = stanza.Pipeline('en', processors = 'tokenize,mwt,pos,lemma,depparse')

2022-01-02 19:03:33 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-01-02 19:03:33 INFO: Use device: gpu
2022-01-02 19:03:33 INFO: Loading: tokenize
2022-01-02 19:03:35 INFO: Loading: pos
2022-01-02 19:03:35 INFO: Loading: lemma
2022-01-02 19:03:35 INFO: Loading: depparse
2022-01-02 19:03:35 INFO: Done loading processors!


In [26]:
def parsing_sentence(sentence_list):
    """Dependency-parse one pre-tokenized sentence with the module-level ``nlp``.

    The words are joined with single spaces and parsed as one string. ``nlp``
    must return a Stanza-style document here (``doc.sentences`` whose items
    provide ``to_dict()`` with ``id``, ``text``, ``upos``, ``head`` and
    ``deprel`` entries).

    Args:
        sentence_list: list of word strings forming one sentence.

    Returns:
        A list with one row of ten strings per parsed word:
        ``[id, word, '_', upos, upos, '_', head_id, deprel, '_', '_']``.
        Ids are 1-based and run continuously over the whole input, even when
        the parser splits it into several sentences; head ids are shifted the
        same way. Every root keeps head id ``'0'``, so an input that was split
        contains more than one root row.
    """
    sentence=' '.join(sentence_list)
    doc=nlp(sentence)
    result=[]
    # Number of words emitted for the sentences handled so far. The parser
    # numbers words from 1 inside each sentence it detects, so ids and heads
    # must be shifted to stay unique and consistent over the whole input.
    offset=0
    for sent in doc.sentences:
        word_count=0
        for each_dict in sent.to_dict():
            word_id=each_dict['id']
            if isinstance(word_id,(tuple,list)):
                # presumably a multi-word-token range entry, which carries no
                # upos/head of its own - TODO confirm against the Stanza version in use
                if len(word_id)!=1:
                    continue
                word_id=word_id[0]
            head=each_dict['head']
            # A head of 0 marks a root and must not be shifted.
            head_id=str(head+offset) if head else '0'
            pos_tag=each_dict['upos']
            result.append([str(word_id+offset),each_dict['text'],'_',pos_tag,pos_tag,'_',
                           head_id,each_dict['deprel'],'_','_'])
            word_count+=1
        offset+=word_count
    return result

def process_data(path_folder,write_path):
    """Parse every file in ``path_folder`` and write one ``.conllx`` file per input.

    Each input file is read as whitespace-separated ``word label`` lines, with
    blank lines separating sentences. Every sentence is passed to
    ``parsing_sentence``, which is expected to return one list of column
    strings per word. Sentences whose result is ``None`` or whose row count
    differs from the label count are skipped and counted as parsing errors.
    For the remaining sentences each row is written tab-separated with the
    entity label appended as the last column, followed by a blank line.

    Args:
        path_folder: directory whose entries are all opened as input files.
        write_path: directory that receives ``<input name>.conllx``; it is
            created if it does not exist.

    Raises:
        AssertionError: if a non-blank input line does not consist of exactly
            two whitespace-separated fields.
    """
    files_path=os.listdir(path_folder)
    print(files_path)
    # Create the output directory up front; an empty write_path means the
    # current directory and needs no creation.
    if write_path:
        os.makedirs(write_path,exist_ok=True)
    for file_name in files_path:
        with open(os.path.join(path_folder,file_name)) as f:
            lines=f.readlines()
        sentences_and_entlabels=[([],[])]
        for line in lines:
            line_split=line.split()
            if not line_split:
                # A blank line closes the current sentence. Only open a new one
                # if the current one has content, so leading or consecutive
                # blank lines do not produce empty sentences in the output.
                if sentences_and_entlabels[-1][0]:
                    sentences_and_entlabels.append(([],[]))
                continue
            assert len(line_split)==2,"expected 'word label', got %r"%line
            word,entity_label=line_split
            sentences_and_entlabels[-1][0].append(word)
            sentences_and_entlabels[-1][1].append(entity_label)

        # Drop the still-empty sentence left open by a trailing blank line
        # (or by an empty file).
        if not sentences_and_entlabels[-1][0]:
            del sentences_and_entlabels[-1]

        with open(os.path.join(write_path,file_name+".conllx"),'w') as f:
            parsing_error_count=0
            for sentences,entlabels in tqdm(sentences_and_entlabels):
                assert len(sentences)==len(entlabels)

                parsing_result=parsing_sentence(sentence_list=sentences)
                if parsing_result is None or len(parsing_result)!=len(entlabels):
                    # The parse could not be aligned with the labels.
                    parsing_error_count+=1
                    continue

                for row,ent in zip(parsing_result,entlabels):
                    # The entity label becomes the final column of the row.
                    f.write('\t'.join(row+[ent])+'\n')

                f.write('\n')
            print("parsing error count : ",parsing_error_count,len(sentences_and_entlabels))

In [27]:
# Convert all files under path_folder again, this time writing into the "Stanza" output folder.
# NOTE(review): relies on `nlp`/parsing_sentence currently being the Stanza versions - confirm kernel state.
process_data(path_folder=path_folder,
             write_path='/home/xhsun/Desktop/gitRepositories/Some-NER-models/data/CoNLL03/Stanza')

  0%|                                                                                                                                                                    | 3/14041 [00:00<07:59, 29.30it/s]

['train.word.bmes', 'test.word.bmes', 'dev.word.bmes']


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14041/14041 [07:26<00:00, 31.45it/s]
  0%|▏                                                                                                                                                                    | 5/3453 [00:00<01:25, 40.47it/s]

parsing error count :  2455 14041


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3453/3453 [01:26<00:00, 39.79it/s]
  0%|▏                                                                                                                                                                    | 4/3250 [00:00<01:46, 30.51it/s]

parsing error count :  623 3453


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3250/3250 [01:23<00:00, 38.71it/s]

parsing error count :  698 3250





In [1]:
# Sentences kept after subtracting the 2455 parsing errors from the 14041 sentences
# reported for the first file of the Stanza run.
14041-2455

11586

In [6]:
# Debug cell: repeat the file-reading part of process_data at top level so the
# parsed sentences can be inspected. `sentences_and_entlabels` is rebuilt for
# every file, so after the loop it holds the sentences of the last file listed.
files_path=os.listdir(path_folder)
print(files_path)
for file_name in files_path:
    with open(os.path.join(path_folder,file_name)) as f:
        lines=f.readlines()
    # Start with one open (words, labels) pair; each blank line opens the next.
    sentences_and_entlabels=[([],[])]
    for line in lines:
        if not line.strip():
            sentences_and_entlabels.append(([],[]))
            continue
        line_split=line.split()
        assert len(line_split)==2
        word,entity_label=line_split
        current_words,current_labels=sentences_and_entlabels[-1]
        current_words.append(word)
        current_labels.append(entity_label)

    # Remove the empty pair left open by a trailing blank line.
    if not any(sentences_and_entlabels[-1]):
        del sentences_and_entlabels[-1]

['train.word.bmes', 'test.word.bmes', 'dev.word.bmes']


In [8]:
# Display the last (words, labels) pair that was loaded.
sentences_and_entlabels[-1]

(['--', 'Dhaka', 'Newsroom', '880-2-506363'], ['O', 'B-ORG', 'E-ORG', 'O'])

In [14]:
# Debug cell: replay the Stanza-based parsing steps on the last loaded sentence
# at top level, so `doc` and `result` can be inspected in the following cells.
sentence_list=sentences_and_entlabels[-1][0]
sentence=' '.join(sentence_list)
doc=nlp(sentence)
print(sentence)
result=[]
for parsed_sentence in doc.sentences:
    for each_dict in parsed_sentence.to_dict():
        # Columns: id, word, lemma placeholder, upos twice, feats placeholder,
        # head id, dependency relation, two trailing placeholders.
        row=[str(each_dict['id']),each_dict['text'],'_']
        row+=[each_dict['upos']]*2
        row+=['_',str(each_dict['head']),each_dict['deprel'],'_','_']
        result.append(row)

-- Dhaka Newsroom 880-2-506363


In [15]:
# Display the rows built for the example sentence.
result

[['1', '--', '_', 'PUNCT', 'PUNCT', '_', '3', 'punct', '_', '_'],
 ['2', 'Dhaka', '_', 'PROPN', 'PROPN', '_', '3', 'compound', '_', '_'],
 ['3', 'Newsroom', '_', 'PROPN', 'PROPN', '_', '0', 'root', '_', '_'],
 ['4', '880-2-506363', '_', 'NUM', 'NUM', '_', '3', 'appos', '_', '_']]

In [13]:
# Number of sentences the parser detected in the example.
len(doc.sentences)

1

In [16]:
# Number of parsed rows; compare with the number of input words and labels.
len(result)

4

In [17]:
# Input words of the example sentence.
sentence_list

['--', 'Dhaka', 'Newsroom', '880-2-506363']

In [18]:
# Entity labels of the example sentence.
sentences_and_entlabels[-1][1]

['O', 'B-ORG', 'E-ORG', 'O']