# Import all required packages

In [None]:
!pip install pandas
!pip install wiktionaryparser
!pip install ipapy
!pip install pymorphy2
!pip install natasha
!pip install seqeval

In [1]:
import pandas as pd
from tqdm import tqdm
import itertools
from wiktionaryparser import WiktionaryParser
import re
from ipapy.ipastring import IPAString
import pymorphy2
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    Doc
)
from seqeval.metrics import classification_report

# Processing source file

In [2]:
mga = pd.read_excel('/content/mga_ner.xlsx')

In [3]:
mga['split_sent'] = mga['split_sent'].apply(lambda x: x.replace("'", ''))
mga['split_sent'] = mga['split_sent'].apply(lambda x: x.replace("[", ''))
mga['split_sent'] = mga['split_sent'].apply(lambda x: x.replace("]", ''))
mga['split_sent'] = mga['split_sent'].apply(lambda x: x.replace(",", ''))
mga['ner'] = mga['ner'].apply(lambda x: x.replace("]", ''))
mga['ner'] = mga['ner'].apply(lambda x: x.replace("[", ''))
mga['ner'] = mga['ner'].apply(lambda x: x.replace("'", ''))
mga['ner'] = mga['ner'].apply(lambda x: x.replace(",", ''))
mga['split_split_sent'] = mga['split_sent'].apply(lambda x: x.split())
mga['split_ner'] = mga['ner'].apply(lambda x: x.split())

In [4]:
def to_json(x, y):
    global mga_json
    mga_json.append({'sentence': x, 'tags': y})

mga_json = []
mga.apply(lambda x: to_json(x['split_split_sent'], x['split_ner']), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
1439    None
1440    None
1441    None
1442    None
1443    None
Length: 1444, dtype: object

# Define features

In [5]:
def yakanye(token, parser_ipa):
    try:
        word = parser_ipa.fetch(token, 'russian')
        ipa = word[0]['pronunciations']['text'][0].replace('IPA: ', '')
        regex = r"(\[.*\])"
        matches = re.findall(regex, ipa)[0][1:-1]
        s_ipa = IPAString(unicode_string=matches, ignore=True)
        last_vowel = ''
        if_palatalized_diacritic_before_vowel = False
        s_ipa_list = []
        for c in s_ipa:
            s_ipa_list.append(c.name)
        count_v = 0
        for each in s_ipa_list:
            if 'vowel' in each:
                count_v += 1
        if count_v >1:
            for i in range(len(s_ipa_list)):
                if s_ipa_list[i] != 'primary-stress suprasegmental':
                    if 'vowel' in s_ipa_list[i]:
                        last_vowel = s_ipa_list[i]
                        try:
                            if s_ipa_list[i-1] == 'palatalized diacritic':
                                if_palatalized_diacritic_before_vowel = True
                        except:
                            pass
                else:
                    break
            if last_vowel == 'near-close near-front unrounded vowel' and if_palatalized_diacritic_before_vowel == True:
                return True
            else:
                return False
    except:
        pass


def v(token):
    if 'в' in token:
        return True
    else:
        return False

    
def sh(token):
    if 'щ' in token:
        return True
    else:
        return False

    
def instrumental(p):
    if p.tag.POS == 'NOUN' and p.tag.case == 'ablt' and p.tag.number == 'plur':
        return True
    else:
        return False

    
def deixis(p):
    d = ['тот', 'этот', 'там', 'тут', 'здесь']
    if p.normal_form in d:
        return True
    else:
        return False

    
def third(p):
    if p.tag.POS == 'VERB' and p.tag.person == '3per' and p.tag.tense == 'pres':
        return True
    else:
        return False

def want(p):
    if p.tag.POS == 'VERB' and p.tag.number == 'plur' and p.tag.tense == 'pres' and p.normal_form == 'хотеть':
        return True
    else:
        return False

    
def infinitives(p, token):
    if p.tag.POS == 'INFN' and (token.endswith('ти') or token.endswith('есть')):
        return True
    else:
        return False

    
def postfixum(p, token):
    vow = ['у', 'е', 'ы', 'а', 'о', 'э', 'я', 'и', 'ю']
    if p.tag.POS == 'VERB' and (token.endswith('cь') or token.endswith('cя')) and token[:-2][-1] in vow:
        return True
    else:
        return False
    

def participles(p, token):
    if (p.tag.POS == 'VERB' or p.tag.POS == 'GRND') and ('вши' in token or 'лши' in token or 'дчи' in token):
        return True
    else:
        return False

    
def participle_agreement_without_aux(doc):
    res = []
    for i in range(len(doc.tokens)):
        ptc = []
        if 'VerbForm' in doc.tokens[i].feats.keys() and doc.tokens[i].feats['VerbForm'] == 'Part' and doc.tokens[i].feats['Voice'] == 'Pass':
            ptc.append([i, doc.tokens[i]])
        all_childs = []
        for p in ptc:
            for j in range(len(doc.tokens)):
                if doc.tokens[j].head_id == p[1].id and doc.tokens[j].rel == 'nsubj:pass' and 'Gender' in doc.tokens[j].feats.keys():
                    all_childs.append([p[0], p[1], j, doc.tokens[j]])
        for each in all_childs:
            if each[1].feats['Gender'] != each[3].feats['Gender']:
                res.append([each[0], each[2]])
    return res


def participle_agreement_with_aux(doc):            
    res = []
    for i in range(len(doc.tokens)):
        ptc = []
        if 'VerbForm' in doc.tokens[i].feats.keys() and doc.tokens[i].feats['VerbForm'] == 'Part' and doc.tokens[i].feats['Voice'] == 'Pass':
            ptc.append([i, doc.tokens[i]])
        ptc_aux = []
        for p in ptc:
            for j in range(len(doc.tokens)):
                if doc.tokens[j].head_id == p[1].id and doc.tokens[j].rel == 'aux:pass':
                    ptc_aux.append([p[0], p[1],  j, doc.tokens[j]])
        all_childs = []
        for p in ptc_aux:
            for k in range(len(doc.tokens)):
                if doc.tokens[k].head_id == p[1].id and doc.tokens[k].rel == 'nsubj:pass' and 'Gender' in doc.tokens[k].feats.keys():
                    all_childs.append([p[0], p[1], p[2], p[3], k, doc.tokens[k]])
        for each in all_childs:
            if each[1].feats['Gender'] != each[3].feats['Gender'] or each[1].feats['Gender'] != each[5].feats['Gender'] or each[3].feats['Gender'] != each[5].feats['Gender']:
                res.append([each[0], each[2], each[4]])
    return res


def nominative_obj(doc):
    res = []
    for i in range(len(doc.tokens)):
        need = []
        if doc.tokens[i].lemma == 'надо':
            need.append([i, doc.tokens[i]])
        need_verb = []
        for p in need:
            for j in range(len(doc.tokens)):
                if doc.tokens[j].head_id == p[1].id and doc.tokens[j].rel == 'csubj':
                    need_verb.append([i, p[1], j, doc.tokens[j]])
        all_childs = []
        for p in need_verb:
            for k in range(len(doc.tokens)):
                if doc.tokens[k].head_id == p[1].id and doc.tokens[k].rel == 'nsubj':
                    all_childs.append([p[0], p[2], k])

        if all_childs != []:
            res.append(all_childs)
    return res


def govorit_na(doc):
    res = []
    verbs = []
    for i in range(len(doc.tokens)):
        if doc.tokens[i].lemma == 'говорить':
            verbs.append([i, doc.tokens[i]])
    for verb in verbs:
        for i in range(len(doc.tokens)):
            if verb[1].id == doc.tokens[i].head_id:
                for j in range(len(doc.tokens)):
                    if doc.tokens[j].head_id == doc.tokens[i].id and doc.tokens[j].lemma == 'на':
                        res.append([verb[0], j])
    return res


def chodit_v(doc):
    res = []
    verbs = []
    for i in range(len(doc.tokens)):
        if doc.tokens[i].lemma == 'ходить':
            verbs.append([i, doc.tokens[i]])
    for verb in verbs:
        for i in range(len(doc.tokens)):
            if verb[1].id == doc.tokens[i].head_id:
                for j in range(len(doc.tokens)):
                    if doc.tokens[j].head_id == doc.tokens[i].id and doc.tokens[j].lemma == 'в':
                        res.append([verb[0], j])
    return res


def plusquamperfect(doc):
    res = []
    verbs = []
    for i in range(len(doc.tokens)):
        if doc.tokens[i].pos == 'VERB':
            verbs.append([i, doc.tokens[i]])
    all_children = []
    for verb in verbs:
        for i in range(len(doc.tokens)):
            if verb[1].id == doc.tokens[i].head_id and doc.tokens[i].rel == 'cop':
                res.append([i, verb[0]])
    return res

# Define an algorithm that invokes checks for the presence of a particular features

In [140]:
def check_features(sent, parser_ipa, morph):
    tokens_ready = {}
    for i in range(len(sent)):
        res_other = {}
        res = {}
        v_token = v(sent[i])
        if v_token:
            res['фонема /в/'] = v_token
        sh_token = sh(sent[i])
        if sh_token:
            res['длинный [ш]'] = sh_token
        yakanye_token = yakanye(sent[i], parser_ipa)
        if yakanye_token:
            res['яканье'] = yakanye_token
        res_other['фонетика'] = res
        
        p = morph.parse(sent[i])[0]
        
        res = {}
        instrumental_token = instrumental(p)
        if instrumental_token:
            res['творительный множественного'] = instrumental_token
        deixis_token = deixis(p)
        if deixis_token:
            res['дейктические'] = deixis_token
        third_token = third(p)
        if third_token:
            res['формы 3 лица презенса'] = third_token
        want_token = want(p)
        if want_token:
            res['формы глагола хотеть'] = want_token
        infinitives_token = infinitives(p, sent[i])
        if infinitives_token:
            res['формы инфинитивов'] = infinitives_token
        postfixum_token = postfixum(p, sent[i])
        if postfixum_token:
            res['возвратные суффиксы'] = postfixum_token
        participles_token = participles(p, sent[i])
        if participles_token:
            res['формы причастий'] = participles_token
        res_other['морфология'] = res
        
        tokens_ready[i] = {sent[i]: res_other}
    
    text = ' '.join(sent)[1:]
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    
    part1 = participle_agreement_without_aux(doc)
    part2 = participle_agreement_with_aux(doc)          
    nom = nominative_obj(doc)
    gov = govorit_na(doc)
    chod = chodit_v(doc)
    pqpf = plusquamperfect(doc)
    
    res_synt = {}
    
    if part1 != []:
        res_synt['согласование причастий без связки'] = part1
    if part2 != []:
        res_synt['согласование причастий со связкой'] = part2
    if nom != []:
        res_synt['номинативный объект'] = nom
    if gov != []:
        res_synt['говорить на'] = gov
    if chod != []:
        res_synt['ходить в'] = chod
    if pqpf != []:
        res_synt['плюсквамперфект'] = pqpf
    
    return res_synt, tokens_ready


# Function response aggregation

In [159]:
emb = NewsEmbedding()
segmenter = Segmenter()
morph_vocab = MorphVocab()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
parser_ipa = WiktionaryParser()
morph = pymorphy2.MorphAnalyzer()

for item in tqdm(mga_json):
    synt, other = check_features(item['sentence'], parser_ipa, morph)
    res_other_p = []
    res_other_m = []
    for f in range(len(other)):
        p_idx = 'O'
        m_idx = 'O'
        for key, value in other[f].items():
            phon = value['фонетика']
            morphol = value['морфология']
            if phon != {}:
                for key, value in phon.items():
                    if value == True:
                        p_idx = 'B-PHON'
            if morphol != {}:
                for key, value in morphol.items():
                    if value == True:
                        p_idx = 'B-MORPH'
        res_other_p.append(p_idx)  
        res_other_m.append(m_idx)
    final_res_morph = []
    for i in range(len(res_other_p)):
        if res_other_p[i] == 'O' and res_other_m[i] == 'O':
            final_res_morph.append('O')
        elif res_other_p[i] != 'O' and res_other_m[i] == 'O':
            final_res_morph.append(res_other_p[i])
        elif res_other_p[i] == 'O' and res_other_m[i] != 'O':
            final_res_morph.append(res_other_m[i])
        else:
            final_res_morph.append(res_other_m[i])
    final_res_synt = ['O']*len(item['sentence'])
    for key, value in synt.items():
        if len(value) == 1:
            res_value = value[0].sort()
            final_res_synt[value[0][0]] = 'B-SYNT'
            for i in range(value[0][0]+1, value[0][1]+1):
                final_res_synt[i] = 'I-SYNT'
        if len(value) > 1:
            all_lists = list(set(sum(value, [])))
            for seq in value:
                seq = sorted(seq)
                for i in range(len(seq)):
                    if i == 0 and final_res_synt[seq[i]] == 'O':
                        final_res_synt[seq[i]] = 'B-SYNT'
                    else:
                        final_res_synt[seq[i]] = 'I-SYNT'
                
    final = []
    for i in range(len(final_res_morph)):
        if final_res_morph[i] == 'O' and final_res_synt[i] == 'O':
            final.append('O')
        elif final_res_morph[i] != 'O' and final_res_synt[i] == 'O':
            final.append(final_res_morph[i])
        elif final_res_morph[i] == 'O' and final_res_synt[i] != 'O':
            final.append(final_res_synt[i])
        else:
            final.append(final_res_synt[i])
    item['predicted'] = final
    

100%|██████████████████████████████████████████████████████████████████████████████| 1444/1444 [45:52<00:00,  1.91s/it]


# Test and evaluate

In [163]:
true_predictions = []
true_labels = []

for x in mga_json:
    true_predictions.append(x['predicted'])
    true_labels.append(x['tags'])
    
print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

         LEX       0.00      0.00      0.00         7
       MORPH       0.41      0.52      0.46       182
        PHON       0.07      0.51      0.13       156
        SYNT       0.00      0.00      0.00        13

   micro avg       0.13      0.49      0.21       358
   macro avg       0.12      0.26      0.15       358
weighted avg       0.24      0.49      0.29       358



  _warn_prf(average, modifier, msg_start, len(result))


In [165]:
true_dial_preds = []
for y in true_predictions:
    r = []
    for x in y:
        if x != 'O' and x[0] == 'B':
            r.append('B-DIAL')
        elif x != 'O' and x[0] == 'I':
            r.append('I-DIAL')
        else:
            r.append('O')
    true_dial_preds.append(r)

true_dial_labels = []
for y in true_labels:
    r = []
    for x in y:
        if x != 'O' and x[0] == 'B':
            r.append('B-DIAL')
        elif x != 'O' and x[0] == 'I':
            r.append('I-DIAL')
        else:
            r.append('O')
    true_dial_labels.append(r)
print(classification_report(true_dial_labels, true_dial_preds))

              precision    recall  f1-score   support

        DIAL       0.15      0.55      0.23       357

   micro avg       0.15      0.55      0.23       357
   macro avg       0.15      0.55      0.23       357
weighted avg       0.15      0.55      0.23       357

