In [127]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
from pymorphy2.tokenizers import simple_word_tokenize
from pymystem3 import Mystem
m = Mystem()
import natasha
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)
import nltk
import spacy
from flair.data import Sentence
from flair.models import SequenceTagger
from sklearn.metrics import accuracy_score

### Русский
Текст для проверки качества состоит из предложений, содержащий следующие проблемные места для разметки:
* Различные омонимы: шум стрих, почитать стих
* Аббревиатуры, сокращения: Росагроэкспорт, МГУ, т.е.
* Имена собственные: необычные имена и названия
* Сложные слова: мегапроизводительный, двадцатичетырехчасовой
* Редкие формы слов: странные императивы и деепричастия
* Редкие слова: заимстовования, сленг, диалектные слова

Откроем ru_text - текст без моей разметки

In [128]:
with open('russian-test.txt', 'r', encoding='utf-8') as f:
    ru_text = f.read()

Функция для извлечения правильных ответов разметки

In [129]:
def get_mark_answers(filename):
    answer = []
    with open(filename, 'r', encoding='utf-8') as f:
        marked = f.read()
    lines = marked.split('\n')
    for line in lines:
        answer.append(line.split('\t')[1])
    return answer

In [130]:
ru_answer = get_mark_answers('ru_marked.txt')

Функция создает словарь соотвествий тегов, чтобы потом все унифицировать

In [131]:
def get_all_pos_names(filename):
    pos_names_dict = {}
    with open(filename, 'r', encoding='utf-8') as f:
        pos_names_file = f.read()
    pos_names = pos_names_file.split('\n')
    for name in pos_names:
        parts = name.split('\t')
        for pos in parts[1].split(','):
            pos_names_dict[pos] = parts[0]
    return pos_names_dict

In [132]:
pos_names_ru = get_all_pos_names('pos_names.txt')

Функции ниже унифицируют теги. Две из них для определенных систем разметки, потому что там есть неоднозначности, а последняя функция пригодится больше одного раза.

In [133]:
def unify_pos_mystem(predict_pos, pos_names):
    for i in range(len(predict_pos)):
        pos = predict_pos[i].split(',')[0].split('=')[0]
        
        if 'cравн' in predict_pos[i]:
            predict_pos[i] = 'COMP'
        elif 'прич' in predict_pos[i]:
            predict_pos[i] = 'ADJ'
        elif pos in pos_names.keys():
            predict_pos[i] =  pos_names[pos]
        else:
            predict_pos[i] = pos
    return predict_pos

In [134]:
def unify_pos_natasha(predict_pos, pos_names):
    for i in range(len(predict_pos)):
        pos = predict_pos[i].pos
        
        if 'Cmp' in predict_pos[i].feats.values():
            predict_pos[i] = 'COMP'
        elif predict_pos[i].pos == 'PRON' and 'Gender' in predict_pos[i].feats and 'Person' not in predict_pos[i].feats and 'Animacy' not in predict_pos[i].feats:
            predict_pos[i] = 'ADJ'
        elif pos in pos_names.keys():
            predict_pos[i] =  pos_names[pos]
        else:
            predict_pos[i] = pos
    return predict_pos

In [135]:
def simple_unify_pos(predict_pos, pos_names):
    for i in range(len(predict_pos)):
        if predict_pos[i] in pos_names.keys():
            predict_pos[i] =  pos_names[predict_pos[i]]
    return predict_pos

#### Pymorphy

In [136]:
tokens = simple_word_tokenize(ru_text)

In [137]:
pymorphy_predict = []

for word in tokens:
    if word[0].isalpha():
        pymorphy_predict.append(str(morph.parse(word)[0].tag.POS))

In [139]:
unified_pymorphy_predict = simple_unify_pos(pymorphy_predict, pos_names_ru)

In [140]:
print("Pymorphy accuracy: %.4f" % accuracy_score(unified_pymorphy_predict, ru_answer))

Pymorphy accuracy: 0.8832


#### Mystem

Майстемовский токенизатор плохо обращается с дефисами, поэтому пришлось как-то регулировать количество ответов.

In [149]:
leftover = 'то'
mystem_predict = []
ana = m.analyze(ru_text)
for word in ana:
    if word != {'text': ' '} and word['text'][0].isalpha() and word['text'] != leftover:
        if len(word['analysis']) == 0:
            mystem_predict.append('None')
        else:
            gram = word['analysis'][0]['gr']
            mystem_predict.append(gram)

In [150]:
unified_mystem_predict = unify_pos_mystem(mystem_predict, pos_names_ru)

In [151]:
print("Mystem accuracy: %.4f" % accuracy_score(unified_mystem_predict, ru_answer))

Mystem accuracy: 0.8613


#### Natasha

In [152]:
segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)

In [153]:
doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)

In [154]:
unified_natasha_predict = unify_pos_natasha(doc.tokens, pos_names_ru)

In [155]:
print("Accuracy: %.4f" % accuracy_score(unified_natasha_predict, ru_answer))

Accuracy: 0.7810


### Английский
Текст для проверки разметки английского в основном состоит из большого количества разных омонимов (пары прил.-сущ., гл.-сущ, ing формы и другое). 
Еще я добавила в текст аббревиатуры, заимствованные слова, сленг.

In [156]:
with open('english-test.txt', 'r', encoding='utf-8') as f:
    en_text = f.read()

In [157]:
with open('en_answer.txt', 'r', encoding='utf-8') as f:
    en_marked = f.read()
    
en_answer = []

lines = en_marked.split('\n')
for line in lines:
    en_answer.append(line.split('\t')[1])

In [158]:
with open('en_pos_names.txt', 'r', encoding='utf-8') as f:
    en_pos_names = f.read()

In [159]:
en_pos_names = get_all_pos_names('en_pos_names.txt')

#### NLTK

In [160]:
tokens=nltk.word_tokenize(en_text)
nltk_pos = nltk.pos_tag(tokens)

In [161]:
nltk_predict = []
for pos in nltk_pos:
    if pos[0][0].isalpha():
        nltk_predict.append(pos[1])

In [162]:
unified_nltk_predict = simple_unify_pos(nltk_predict, en_pos_names)

In [163]:
print("NLTK accuracy: %.4f" % accuracy_score(unified_nltk_predict, en_answer))

NLTK accuracy: 0.8324


#### Spacy

In [164]:
nlp = spacy.load("en_core_web_sm")

In [165]:
spacy_predict = []
doc = nlp(en_text)
for s in doc.sents:
    for t in s:
        if t.text[0].isalpha():
            spacy_predict.append(t.pos_)

In [166]:
unified_spacy_predict = simple_unify_pos(spacy_predict, en_pos_names)

In [167]:
print("Spacy accuracy: %.4f" % accuracy_score(unified_spacy_predict, en_answer))

Spacy accuracy: 0.9017


#### Flair

In [168]:
tagger = SequenceTagger.load('pos')

2020-10-11 19:56:06,678 loading file C:\Users\Yana\.flair\models\en-pos-ontonotes-v0.5.pt


In [169]:
text = Sentence(en_text)
tagger.predict(text)
tagged_text = text.to_tagged_string()

In [170]:
flair_predict = []
pairs = tagged_text.split('> ')
for pair in pairs:
    part = pair.split()
    if part[0][0].isalpha():
          flair_predict.append(part[1][1:])

In [171]:
unified_flair_predict = simple_unify_pos(flair_predict, en_pos_names)

In [172]:
print("Accuracy: %.4f" % accuracy_score(unified_flair_predict, en_answer))

Accuracy: 0.8960
