In [1]:
import re
import spacy
from tqdm import tqdm
import os
from spacy import displacy
import xml.etree.cElementTree as ET

# 多语言模型，仅识别实体

In [2]:
nlp = spacy.load("xx_ent_wiki_sm")

In [3]:
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [8]:
def check_lang(lang):
    sentences = []
    titles = []
    count = {'ORG': 0, 'LOC': 0, 'PER': 0, 'MISC': 0, "NoEnt": 0}
    total = 0
    lines = 0
    point = 0
    with open("./WikiTrain19/raw_data/" + lang + ".txt", "r") as f:
        for line in f:
            title, sentence = line.split('\t')
            titles.append(title)
            sentences.append(sentence[:-1])
            lines += 1
            if lines == 1000:
                break
    l = len(sentences)
    doc_raw = ' . '.join(sentences)
    doc = nlp(doc_raw)
    for idx, sent in enumerate(doc.sents):
        flag = False
        if idx == l:
            break
        for ent in sent.ents:
            if titles[idx].lower() in ent.text.lower():
                flag = True
                count[ent.label_] += 1
                point += 1
            break
        if not flag:
            count["NoEnt"] += 1
        total += 1
    print(lang)
    for key in count:
        print(key, count[key])
    print(point/total)

In [9]:
for lang in ["en", "de", "es", "fr", "it", "pt", "ru"]:
    print(lang)
    check_lang(lang)

en
en
ORG 12
LOC 43
PER 33
MISC 17
NoEnt 895
0.105
de
de
ORG 31
LOC 264
PER 153
MISC 32
NoEnt 520
0.48
es
es
ORG 2
LOC 9
PER 2
MISC 0
NoEnt 987
0.013
fr
fr
ORG 1
LOC 13
PER 14
MISC 1
NoEnt 44
0.3972602739726027
it
it
ORG 2
LOC 1
PER 4
MISC 1
NoEnt 365
0.021447721179624665
pt
pt
ORG 0
LOC 25
PER 3
MISC 10
NoEnt 56
0.40425531914893614
ru
ru
ORG 0
LOC 4
PER 0
MISC 4
NoEnt 24
0.25


# 单语言模型，根据依存分析找出主语

In [34]:
for lang, model in zip(["en", "de", "fr", "es", "el"], [
        "en_core_web_sm", "de_core_news_sm", "fr_core_news_sm",
        "es_core_news_sm", "el_core_news_sm"
]):
    print(lang)
    nlp = spacy.load(model)
    count = 0
    right = 0
    for f in os.listdir("./WikiTrain19/full/" + lang):
        tree = ET.ElementTree(file="./WikiTrain19/full/" + lang + "/" + f)
        root = tree.getroot()
        title = root[0].text.lower()
        for sentence in root[1][0].text.split(". "):
            doc = nlp(sentence)
            for token in doc:
                if token.dep_ == "ROOT":
                    root = token
            for chunk in doc.noun_chunks:
                if chunk.root.head == root:
                    result = chunk.text
                    break
            if title in result.lower():
                #                 print(sentence)
                #                 print(title + " | " + result.lower())
                right += 1
        count += 1
    print(right / count)

en
0.5067229106248352
de
0.4895833333333333
fr
0.4
es
0.451171875
el
0.4


In [33]:
for lang, model in zip(
    ["pt", "it", "nl"],
    ["pt_core_news_sm", "it_core_news_sm", "nl_core_news_sm"]):
    print("[" + lang + "]")
    nlp = spacy.load(model)
    count = 0
    right = 0
    for f in os.listdir("./WikiTrain19/full/" + lang):
        tree = ET.ElementTree(file="./WikiTrain19/full/" + lang + "/" + f)
        root = tree.getroot()
        title = root[0].text.lower()
        for sentence in root[1][0].text.split(". "):
            doc = nlp(sentence)
            for token in doc:
                if token.dep_ == "ROOT":
                    root = token
            result = ""
            for ent in doc.ents:
                result = ent.text
                break
            if title in result.lower():
                #                 print(sentence)
                #                 print(title + " | " + result.lower())
                right += 1
        count += 1
    print(right / count)

[pt]
0.43333333333333335
[it]
0.5234657039711191
[nl]
0.5666666666666667


In [6]:
for f in os.listdir("./WikiTrain19/full/en"):
    tree = ET.ElementTree(file="./WikiTrain19/full/en/" + f)
    root = tree.getroot()
    for section in root[2]:
        print(section)
        for t in section:
            print(t)
        print()
    break
    

<Element 'section' at 0x7f5cd44f34f8>
<Element 'header' at 0x7f5cd44f34a8>
<Element 'p' at 0x7f5cd44f3458>
<Element 'p' at 0x7f5cd44f3408>
<Element 'p' at 0x7f5cd44f3098>

<Element 'section' at 0x7f5cd44f31d8>
<Element 'header' at 0x7f5cd44f3188>
<Element 'p' at 0x7f5cd44f3138>
<Element 'p' at 0x7f5cd44f30e8>
<Element 'p' at 0x7f5cd44f3048>

<Element 'section' at 0x7f5cd41cdbd8>
<Element 'header' at 0x7f5cd41cdae8>
<Element 'p' at 0x7f5cd41cda98>
<Element 'p' at 0x7f5cd41cd728>
<Element 'p' at 0x7f5cd41cd778>
<Element 'p' at 0x7f5cd41cd9f8>
<Element 'p' at 0x7f5cd41cda48>
<Element 'p' at 0x7f5cd41cd8b8>
<Element 'section' at 0x7f5cd41cdc28>
<Element 'section' at 0x7f5cd41cdf48>
<Element 'section' at 0x7f5cd41e4f98>

<Element 'section' at 0x7f5cd44cf228>
<Element 'header' at 0x7f5cd44cf278>
<Element 'section' at 0x7f5cd44cf2c8>
<Element 'section' at 0x7f5cd44cf4f8>
<Element 'section' at 0x7f5cd44cf5e8>
<Element 'section' at 0x7f5cd44cf728>
<Element 'section' at 0x7f5cd44cf868>

<Element