# Import all required packages

In [None]:
!pip install pandas
!pip install wiktionaryparser
!pip install ipapy
!pip install pymorphy2
!pip install natasha
!pip install seqeval

In [3]:
import re
from tqdm import tqdm
import pandas as pd
import itertools
from wiktionaryparser import WiktionaryParser
import re
from ipapy.ipastring import IPAString
import pymorphy2
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    Doc
)
from seqeval.metrics import classification_report

# Processing source file

In [4]:
enm = pd.read_excel('/content/enm1930_ner.xlsx')
enm['tokens'] = enm['tokens'].apply(lambda x: x.replace("'", ''))
enm['tokens'] = enm['tokens'].apply(lambda x: x.replace("[", ''))
enm['tokens'] = enm['tokens'].apply(lambda x: x.replace("]", ''))
enm['tokens'] = enm['tokens'].apply(lambda x: x.replace(",", ''))
enm['tags'] = enm['tags'].apply(lambda x: x.replace("]", ''))
enm['tags'] = enm['tags'].apply(lambda x: x.replace("[", ''))
enm['tags'] = enm['tags'].apply(lambda x: x.replace("'", ''))
enm['tags'] = enm['tags'].apply(lambda x: x.replace(",", ''))
enm['split_split_sent'] = enm['tokens'].apply(lambda x: x.split())
enm['split_ner'] = enm['tags'].apply(lambda x: x.split())

In [5]:
def to_json(x, y):
    global enm_json
    enm_json.append({'sentence': x, 'tags': y})

enm_json = []
enm.apply(lambda x: to_json(x['split_split_sent'], x['split_ner']), axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
463    None
464    None
465    None
466    None
467    None
Length: 468, dtype: object

# Define features

In [7]:
def yakanye(token, parser_ipa):
    try:
        word = parser_ipa.fetch(token, 'russian')
        ipa = word[0]['pronunciations']['text'][0].replace('IPA: ', '')
        regex = r"(\[.*\])"
        matches = re.findall(regex, ipa)[0][1:-1]
        s_ipa = IPAString(unicode_string=matches, ignore=True)
        last_vowel = ''
        if_palatalized_diacritic_before_vowel = False
        s_ipa_list = []
        for c in s_ipa:
            s_ipa_list.append(c.name)
        count_v = 0
        for each in s_ipa_list:
            if 'vowel' in each:
                count_v += 1
        if count_v >1:
            for i in range(len(s_ipa_list)):
                if s_ipa_list[i] != 'primary-stress suprasegmental':
                    if 'vowel' in s_ipa_list[i]:
                        last_vowel = s_ipa_list[i]
                        try:
                            if s_ipa_list[i-1] == 'palatalized diacritic':
                                if_palatalized_diacritic_before_vowel = True
                        except:
                            pass
                else:
                    break
            if last_vowel == 'near-close near-front unrounded vowel' and if_palatalized_diacritic_before_vowel == True:
                return True
            else:
                return False
    except:
        pass


def v(token):
    if 'в' in token:
        return True
    else:
        return False

    
def sh(token):
    if 'щ' in token:
        return True
    else:
        return False

    
def instrumental(p, doc, i):
    head = doc.tokens[i].id
    is_in_PP = False
    for token in doc.tokens:
        if token.head_id == head and token.pos == 'ADP':
            is_in_PP = True
    if p.tag.POS == 'NOUN' and p.tag.case == 'ablt' and p.tag.number == 'plur' and is_in_PP == False:
        return True
    else:
        return False

    
def deixis(p):
    d = ['тот', 'этот', 'там', 'тут', 'здесь']
    if p.normal_form in d:
        return True
    else:
        return False

    
def third(p):
    if p.tag.POS == 'VERB' and p.tag.person == '3per' and (p.tag.tense == 'pres' or p.tag.tense == 'futr') and p.tag.mood == 'indc' and not p.word.endswith('ся'):
        return True
    else:
        return False

def want(p):
    if p.tag.POS == 'VERB' and p.tag.number == 'plur' and p.tag.tense == 'pres' and p.normal_form == 'хотеть':
        return True
    else:
        return False

    
def infinitives(p, token):
    if p.tag.POS == 'INFN' and (token.endswith('ти') or token.endswith('есть')):
        return True
    else:
        return False

    
def postfixum(p, token):
    vow = ['у', 'е', 'ы', 'а', 'о', 'э', 'я', 'и', 'ю']
    if p.tag.POS == 'VERB' and (token.endswith('сь') or token.endswith('ся')) and token[:-2][-1] in vow:
        return True
    else:
        return False
    

def participles(p, token):
    if (p.tag.POS == 'VERB' or p.tag.POS == 'GRND') and ('ши' in token or 'дчи' in token):
        return True
    else:
        return False

    
def participle_agreement_without_aux(doc):
    res = []
    for i in range(len(doc.tokens)):
        ptc = []
        if 'VerbForm' in doc.tokens[i].feats.keys() and doc.tokens[i].feats['VerbForm'] == 'Part' and doc.tokens[i].feats['Voice'] == 'Pass' and 'Gender' in doc.tokens[i].feats.keys():
            ptc.append([i, doc.tokens[i]])
        all_childs = []
        for p in ptc:
            for j in range(len(doc.tokens)):
                if doc.tokens[j].head_id == p[1].id and doc.tokens[j].rel == 'nsubj:pass' and 'Gender' in doc.tokens[j].feats.keys():
                    all_childs.append([p[0], p[1], j, doc.tokens[j]])
        for each in all_childs:
            if each[1].feats['Gender'] != each[3].feats['Gender']:
                res.append([each[0], each[2]])
    return res


def participle_agreement_with_aux(doc):            
    res = []
    for i in range(len(doc.tokens)):
        ptc = []
        if 'VerbForm' in doc.tokens[i].feats.keys() and doc.tokens[i].feats['VerbForm'] == 'Part' and doc.tokens[i].feats['Voice'] == 'Pass' and 'Gender' in doc.tokens[i].feats.keys():
            ptc.append([i, doc.tokens[i]])
        ptc_aux = []
        for p in ptc:
            for j in range(len(doc.tokens)):
                if doc.tokens[j].head_id == p[1].id and doc.tokens[j].rel == 'aux:pass' and 'Gender' in doc.tokens[j].feats.keys():
                    ptc_aux.append([p[0], p[1],  j, doc.tokens[j]])
        all_childs = []
        for p in ptc_aux:
            for k in range(len(doc.tokens)):
                if doc.tokens[k].head_id == p[1].id and doc.tokens[k].rel == 'nsubj:pass' and 'Gender' in doc.tokens[k].feats.keys():
                    all_childs.append([p[0], p[1], p[2], p[3], k, doc.tokens[k]])
        for each in all_childs:
            if each[1].feats['Gender'] != each[3].feats['Gender'] or each[1].feats['Gender'] != each[5].feats['Gender'] or each[3].feats['Gender'] != each[5].feats['Gender']:
                res.append([each[0], each[2], each[4]])
    return res


def govorit_na(doc):
    res = []
    verbs = []
    for i in range(len(doc.tokens)):
        if doc.tokens[i].lemma == 'говорить' or doc.tokens[i].lemma == 'сказать':
            verbs.append([i, doc.tokens[i]])
    for verb in verbs:
        for i in range(len(doc.tokens)):
            if verb[1].id == doc.tokens[i].head_id:
                for j in range(len(doc.tokens)):
                    if doc.tokens[j].head_id == doc.tokens[i].id and doc.tokens[j].lemma == 'на':
                        res.append([verb[0], j])
    return res


def chodit_v(doc):
    res = []
    verbs = []
    verbs_lexeme = ['ходить', 'пойти', 'ездить', 'сходить', 'идти', 'приехать', 
                    'ехать', 'приезжать', 'прийти', 'поехать', 'приходить']
    nouns_lexeme = ['гриб', 'ягода', 'орех', 'клюква', 'дрова', 'малина', 'черника', 'черница', 
                    'сморода', 'смородина', 'брусника', 'брусница', 'земляника', 'земляница']
    for i in range(len(doc.tokens)):
        if doc.tokens[i].lemma in verbs_lexeme:
            verbs.append([i, doc.tokens[i]])
    verb_noun = []
    for verb in verbs:
        for i in range(len(doc.tokens)):
            if doc.tokens[i].head_id == verb[1].id and doc.tokens[i].lemma in nouns_lexeme:
                verb_noun.append([verb[0], verb[1], i, doc.tokens[i]])
    for item in verb_noun:
        for k in range(len(doc.tokens)):
            if item[3].id == doc.tokens[k].head_id and doc.tokens[k].lemma == 'в':
                res.append([item[0], item[2], k])
    return res

# Define an algorithm that invokes checks for the presence of a particular features

In [8]:
def check_features(sent, parser_ipa, morph):
    tokens_ready = {}
    text = ' '.join(sent)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    for i in range(len(sent)):
        res_other = {}
        res = {}
        v_token = v(sent[i])
        if v_token:
            res['фонема /в/'] = v_token
        sh_token = sh(sent[i])
        if sh_token:
            res['длинный [ш]'] = sh_token
        yakanye_token = yakanye(sent[i], parser_ipa)
        if yakanye_token:
            res['яканье'] = yakanye_token
        res_other['фонетика'] = res
        
        p = morph.parse(sent[i])[0]
        
        res = {}
        instrumental_token = instrumental(p, doc, i)
        if instrumental_token:
            res['творительный множественного'] = instrumental_token
        deixis_token = deixis(p)
        if deixis_token:
            res['дейктические'] = deixis_token
        third_token = third(p)
        if third_token:
            res['формы 3 лица презенса'] = third_token
        want_token = want(p)
        if want_token:
            res['формы глагола хотеть'] = want_token
        infinitives_token = infinitives(p, sent[i])
        if infinitives_token:
            res['формы инфинитивов'] = infinitives_token
        postfixum_token = postfixum(p, sent[i])
        if postfixum_token:
            res['возвратные суффиксы'] = postfixum_token
        participles_token = participles(p, sent[i])
        if participles_token:
            res['формы причастий'] = participles_token
        res_other['морфология'] = res
        
        tokens_ready[i] = {sent[i]: res_other}
    
    text = ' '.join(sent)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    
    part1 = participle_agreement_without_aux(doc)
    part2 = participle_agreement_with_aux(doc)
    gov = govorit_na(doc)
    chod = chodit_v(doc)
    res_synt = {}
        
    if part1 != []:
        res_synt['согласование причастий без связки'] = part1
    if part2 != []:
        res_synt['согласование причастий со связкой'] = part2
    if gov != []:
        res_synt['говорить на'] = gov
    if chod != []:
        res_synt['ходить в'] = chod
    
    return res_synt, tokens_ready



# Function response aggregation

In [9]:
emb = NewsEmbedding()
segmenter = Segmenter()
morph_vocab = MorphVocab()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
parser_ipa = WiktionaryParser()
morph = pymorphy2.MorphAnalyzer()

for item in tqdm(enm_json):
    synt, other = check_features(item['sentence'], parser_ipa, morph)
    res_other_p = []
    res_other_m = []
    for f in range(len(other)):
        p_idx = 'O'
        m_idx = 'O'
        for key, value in other[f].items():
            phon = value['фонетика']
            morphol = value['морфология']
            if phon != {}:
                for key, value in phon.items():
                    if value == True:
                        p_idx = 'B-PHON'
            if morphol != {}:
                for key, value in morphol.items():
                    if value == True:
                        p_idx = 'B-MORPH'
        res_other_p.append(p_idx)  
        res_other_m.append(m_idx)
    final_res_morph = []
    for i in range(len(res_other_p)):
        if res_other_p[i] == 'O' and res_other_m[i] == 'O':
            final_res_morph.append('O')
        elif res_other_p[i] != 'O' and res_other_m[i] == 'O':
            final_res_morph.append(res_other_p[i])
        elif res_other_p[i] == 'O' and res_other_m[i] != 'O':
            final_res_morph.append(res_other_m[i])
        else:
            final_res_morph.append(res_other_m[i])
    final_res_synt = ['O']*len(item['sentence'])
    for key, value in synt.items():
        if len(value) == 1:
            res_value = sorted(value[0])
            final_res_synt[res_value[0]] = 'B-SYNT'
            for i in range(len(res_value[1:])):
                final_res_synt[res_value[i]] = 'I-SYNT'
        if len(value) > 1:
            all_lists = list(set(sum(value, [])))
            for seq in value:
                seq = sorted(seq)
                for i in range(len(seq)):
                    if i == 0 and final_res_synt[seq[i]] != 'I-SYNT':
                        final_res_synt[seq[i]] = 'B-SYNT'
                    else:
                        final_res_synt[seq[i]] = 'I-SYNT'

    final = []
    for i in range(len(final_res_morph)):
        if final_res_morph[i] == 'O' and final_res_synt[i] == 'O':
            final.append('O')
        elif final_res_morph[i] != 'O' and final_res_synt[i] == 'O':
            final.append(final_res_morph[i])
        elif final_res_morph[i] == 'O' and final_res_synt[i] != 'O':
            final.append(final_res_synt[i])
        else:
            final.append(final_res_synt[i])
    item['predicted'] = final
    

100%|█████████████████████████████████████████| 468/468 [52:28<00:00,  6.73s/it]


In [10]:
enm_df = pd.DataFrame.from_records(enm_json)
enm_df.to_excel('/content/enm1930_ner_3.xlsx') 

# Evaluate

In [11]:
true_predictions = []
true_labels = []

for x in enm_json:
    true_predictions.append(x['predicted'])
    true_labels.append(x['tags'])
    
print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

         LEX       0.00      0.00      0.00        42
       MORPH       0.59      0.71      0.64       206
        PHON       0.27      0.71      0.39       248
        SYNT       0.60      0.12      0.21        24

   micro avg       0.35      0.63      0.45       520
   macro avg       0.36      0.39      0.31       520
weighted avg       0.39      0.63      0.45       520



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
true_dial_preds = []
for y in true_predictions:
    r = []
    for x in y:
        if x != 'O' and x[0] == 'B':
            r.append('B-DIAL')
        elif x != 'O' and x[0] == 'I':
            r.append('I-DIAL')
        else:
            r.append('O')
    true_dial_preds.append(r)

true_dial_labels = []
for y in true_labels:
    r = []
    for x in y:
        if x != 'O' and x[0] == 'B':
            r.append('B-DIAL')
        elif x != 'O' and x[0] == 'I':
            r.append('I-DIAL')
        else:
            r.append('O')
    true_dial_labels.append(r)
print(classification_report(true_dial_labels, true_dial_preds))

              precision    recall  f1-score   support

        DIAL       0.38      0.67      0.49       520

   micro avg       0.38      0.67      0.49       520
   macro avg       0.38      0.67      0.49       520
weighted avg       0.38      0.67      0.49       520

