In [1]:
import sys
import dawg
import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import load_lines
from utils import dump_lines

from utils import load_pickle
from utils import dump_pickle

from parse import parse_dict

In [2]:
%time data, cols = parse_dict('data/dict.opcorpora.xml')

CPU times: user 1min 21s, sys: 1.37 s, total: 1min 23s
Wall time: 1min 23s


In [3]:
# Wrap each table returned by parse_dict in a DataFrame, using the matching
# column list from `cols` (both dicts are keyed by table name).
df_types = pd.DataFrame(data['types'], columns=cols['types'])
df_links = pd.DataFrame(data['links'], columns=cols['links'])
df_lemmas = pd.DataFrame(data['lemmas'], columns=cols['lemmas'])
df_grammemes = pd.DataFrame(data['grammemes'], columns=cols['grammemes'])

In [4]:
# Fold the 'form' column into the tag as "<tag> @ <form>", then drop the
# now-redundant 'form' column.
df_lemmas = (
    df_lemmas
    .assign(tag=df_lemmas['tag'] + ' @ ' + df_lemmas['form'])
    .drop(columns='form')
)

In [6]:
df_types.head()

Unnamed: 0,id,type
0,1,ADJF-ADJS
1,2,ADJF-COMP
2,3,INFN-VERB
3,4,INFN-PRTF
4,5,INFN-GRND


In [7]:
df_types.head()

Unnamed: 0,id,type
0,1,ADJF-ADJS
1,2,ADJF-COMP
2,3,INFN-VERB
3,4,INFN-PRTF
4,5,INFN-GRND


In [8]:
df_grammemes.head()

Unnamed: 0,name,alias,parent,description
0,POST,ЧР,,часть речи
1,NOUN,СУЩ,POST,имя существительное
2,ADJF,ПРИЛ,POST,имя прилагательное (полное)
3,ADJS,КР_ПРИЛ,POST,имя прилагательное (краткое)
4,COMP,КОМП,POST,компаратив


In [9]:
df_lemmas.head()

Unnamed: 0,id,tag,word
0,1,"NOUN,anim,masc @ sing,nomn",ёж
1,1,"NOUN,anim,masc @ sing,gent",ежа
2,1,"NOUN,anim,masc @ sing,datv",ежу
3,1,"NOUN,anim,masc @ sing,accs",ежа
4,1,"NOUN,anim,masc @ sing,ablt",ежом


In [40]:
# VERB,PRTF,PRTS,GRND -> INFN
# ADJS,COMP,ADVB -> ADJF sing masc nomn
# NOUN -> NOUN sing masc/femn/neut nomn
# Other parts of speech: only a single initial (normal) form

# NOUN(СУЩ), ADJF(ПРИЛ), ADJS(КР_ПРИЛ), COMP(КОМП), 
# VERB(ГЛ), INFN(ИНФ), PRTF(ПРИЧ), PRTS(КР_ПРИЧ), 
# GRND(ДЕЕПР), NUMR(ЧИСЛ), ADVB(Н), NPRO(МС), 
# PRED(ПРЕДК), PREP(ПР), CONJ(СОЮЗ), PRCL(ЧАСТ), INTJ(МЕЖД)

def get_norm_score(tag):
    """Score how well a tag string fits a lemma's normal (dictionary) form.

    The score is a sum of fixed weights, each applied when its marker occurs
    anywhere in ``tag`` as a substring. The notebook later picks the
    highest-scoring row of each lemma group as the normal form.

    Parameters
    ----------
    tag : str
        Tag string such as ``"NOUN,anim,masc @ sing,nomn"``.

    Returns
    -------
    int
        The accumulated score (may be negative).
    """
    # Parts of speech that can be a normal form: one flat bonus, applied once
    # even if several of them occur.
    base_pos = ('NOUN', 'INFN', 'ADJF')

    # (marker, weight) pairs; every marker found in the tag contributes.
    weights = (
        ('Qual', 1),
        ('masc', 2),
        ('femn', 1),
        ('sing', 2),
        ('nomn', 2),
        ('impf', 1),
        ('V-ie', -1),
        ('V-be', -1),
        ('Supr', -1),
        ('Infr', -1),
        ('V-sh', -1),
        ('Erro', -1),
        ('Slng', -1),
        ('Litr', -1),
        ('Dist', -1),
    )

    total = 10 if any(pos in tag for pos in base_pos) else 0
    total += sum(weight for marker, weight in weights if marker in tag)
    return total

df_lemmas['norm_score'] = df_lemmas['tag'].apply(get_norm_score)

In [42]:
def get_idx(sets, id):
    """Return the position of the first set in ``sets`` containing ``id``.

    Returns ``None`` when no set contains it. This is a linear scan over
    ``sets``.
    """
    return next(
        (pos for pos, members in enumerate(sets) if id in members),
        None,
    )

def add_link(sets, from_id, to_id, 
             type, excluded_types):
    """Record that ``from_id`` and ``to_id`` belong to the same group.

    ``sets`` is a list of disjoint id sets and is modified in place:

    * neither id known  -> a new two-element set is appended;
    * exactly one known -> the other id joins that set;
    * both known, in different sets -> the target's set is merged into the
      source's set and removed from the list;
    * both already in the same set -> nothing changes.

    Links whose ``type`` is in ``excluded_types`` are ignored entirely.
    Returns ``None``.
    """
    if type in excluded_types:
        return

    # Same lookup order as always: target first, then source.
    to_idx = get_idx(sets, to_id)
    from_idx = get_idx(sets, from_id)

    to_known = to_idx is not None
    from_known = from_idx is not None

    if not to_known and not from_known:
        sets.append({from_id, to_id})
    elif to_known and not from_known:
        sets[to_idx].add(from_id)
    elif from_known and not to_known:
        sets[from_idx].add(to_id)
    elif to_idx != from_idx:
        # Merge before deleting: removing the target set shifts later indices.
        sets[from_idx].update(sets[to_idx])
        del sets[to_idx]

In [43]:
%%time

# Accumulates disjoint sets of lemma ids that are connected by links.
sets = []
# Link type ids that must not join lemmas together.
# NOTE(review): presumably these are ids from df_types -- TODO confirm which
# link types they are and why they are excluded.
excluded = {7, 21, 23, 27}

# NOTE(review): `links` is not defined in any visible cell, so this fails on a
# fresh kernel. Presumably it is the raw row list data['links'] with rows laid
# out as (id, from, to, type) -- TODO confirm and define it explicitly.
# Each add_link call scans `sets` linearly, which explains the ~30 min runtime
# recorded in the cell output.
for link in tqdm(links, mininterval=1):
    add_link(sets, link[1], link[2], link[3], excluded)

100%|████████████████████████████████████████████████████████████████████████████████| 258650/258650 [29:57<00:00, 143.90it/s]

CPU times: user 29min 52s, sys: 2.1 s, total: 29min 54s
Wall time: 29min 57s





In [44]:
# Checkpoint the link sets to disk and read them straight back, so a later
# session can run only the load line instead of rebuilding the sets.
# NOTE(review): load_pickle presumably unpickles the file -- only load files
# this notebook produced itself.
dump_pickle(sets, 'data/links.bin')
sets = load_pickle('data/links.bin')

In [45]:
# Map every id of a linked set to the smallest id of that set, so that all
# linked lemmas end up sharing one id.
repl = {}

for ids in tqdm(sets):
    members = sorted(ids)
    for member in members[1:]:
        repl[member] = members[0]

# Ids outside any set are not in `repl`; map() yields NaN for them and
# fillna() restores their original id before casting back to an integer type.
canonical_id = df_lemmas['id'].map(repl)
df_lemmas['id'] = canonical_id.fillna(df_lemmas['id']).astype('uint32')

100%|███████████████████████████████████████████████████████████████████████████████| 66195/66195 [00:00<00:00, 400032.21it/s]


In [46]:
# For every row, store the index label of the highest-scoring form within its
# id group; transform broadcasts that single label to all rows of the group.
# idxmax returns the first label when several rows share the top score.
df_lemmas['norm_idx'] = df_lemmas.groupby('id')['norm_score'].transform(lambda x: x.idxmax())

In [47]:
def longest_common_substring(data):
    """Return the longest string that is a contiguous substring of every item.

    Candidates are taken from ``data[0]``. Among equally long candidates the
    one starting earliest in ``data[0]`` wins.

    Parameters
    ----------
    data : sequence of str
        Strings to compare.

    Returns
    -------
    str
        The common substring; ``''`` when ``data`` is empty, when the first
        item is empty, or when nothing is shared. A single-item input returns
        that item unchanged.
    """
    if len(data) == 1:
        return data[0]
    if not data or len(data[0]) == 0:
        return ''

    first = data[0]
    best = ''
    for start in range(len(first)):
        # Only candidates strictly longer than the current best can replace
        # it, so shorter lengths are skipped instead of being re-tested.
        for length in range(len(best) + 1, len(first) - start + 1):
            candidate = first[start:start + length]
            if not all(candidate in item for item in data):
                # If this candidate is not shared, no longer one starting at
                # the same position can be shared either.
                break
            best = candidate
    return best

def split(group):
    """Split every word form of one lemma group into prefix + stem + suffix.

    The stem is the longest substring shared by all forms in the group
    (see ``longest_common_substring``); it is located at its first occurrence
    in each form.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one lemma group; must have a ``'word'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index plus four columns:
        ``prefix`` and ``suffix`` (text before / after the stem),
        ``stem_offset`` (where the stem starts in the form) and
        ``stem_length`` (the same for every row). ``group`` itself is left
        unmodified, because pandas does not support mutating the object
        passed to a groupby UDF.
    """
    forms = group['word'].tolist()
    stem = longest_common_substring(forms)

    # The stem offset equals the prefix length; str.index cannot fail because
    # the stem occurs in every form by construction.
    offsets = [form.index(stem) for form in forms]
    prefixes = [form[:offset] for form, offset in zip(forms, offsets)]
    suffixes = [
        form[offset + len(stem):]
        for form, offset in zip(forms, offsets)
    ]

    return group.assign(
        prefix=prefixes,
        suffix=suffixes,
        stem_offset=offsets,
        stem_length=len(stem),
    )

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Run split() once per (merged) lemma id.
df_lemmas = df_lemmas.groupby('id', group_keys=False).progress_apply(split)
# NOTE(review): 'norm_idx' holds index labels computed before this cell; it
# stays valid only if the apply above kept the original labels and they were
# already 0..n-1 -- TODO confirm, or compute norm_idx after this reset.
df_lemmas = df_lemmas.reset_index(drop=True)

100%|████████████████████████████████████████████████████████████████████████████████| 183259/183259 [04:00<00:00, 762.14it/s]


In [48]:
# Store the highly repetitive string columns as categoricals; their integer
# codes are what gets written into the DAWG keys later.
df_lemmas = df_lemmas.astype({
    'tag': 'category',
    'prefix': 'category',
    'suffix': 'category',
})

In [49]:
# Attach to every form the prefix, suffix and tag of its group's normal form:
# 'norm_idx' (left) is matched against the frame's own index (right), and the
# columns taken from the normal-form row get the '_norm' suffix.
params = {
    'right_index': True,
    'left_on': 'norm_idx',
    'suffixes': ('', '_norm')
}
# Deliberately not named `cols`: that name still holds the column dict
# returned by parse_dict, which the DataFrame-construction cell needs when it
# is re-run.
norm_cols = ['prefix', 'suffix', 'tag']
df_lemmas = pd.merge(df_lemmas, df_lemmas[norm_cols], **params)

In [50]:
df_lemmas.head()

Unnamed: 0,id,tag,word,norm_score,norm_idx,prefix,suffix,stem_offset,stem_length,prefix_norm,suffix_norm,tag_norm
0,1,"NOUN,anim,masc @ sing,nomn",ёж,16,0,ё,,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
1,1,"NOUN,anim,masc @ sing,gent",ежа,14,0,е,а,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
2,1,"NOUN,anim,masc @ sing,datv",ежу,14,0,е,у,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
3,1,"NOUN,anim,masc @ sing,accs",ежа,14,0,е,а,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
4,1,"NOUN,anim,masc @ sing,ablt",ежом,14,0,е,ом,1,1,ё,,"NOUN,anim,masc @ sing,nomn"


In [51]:
def save_categories(series, path):
    """Write the categories of a categorical Series to ``path`` via dump_lines.

    The categories are written in category order, so the position of a value
    in the written list equals its category code.
    """
    categories = series.cat.categories
    dump_lines(categories.tolist(), path)

# Dump the lookup tables used to decode DAWG keys. The keys store category
# codes, and the reader indexes these lists by code. The '*_norm' columns are
# decoded with the same three lists.
save_categories(df_lemmas['tag'], 'data/tags.txt')
save_categories(df_lemmas['prefix'], 'data/prefixes.txt')
save_categories(df_lemmas['suffix'], 'data/suffixes.txt')

In [54]:
# Persist all tables as Feather files. The index is reset first -- presumably
# because to_feather requires a default index (TODO confirm).
df_links.reset_index(drop=True).to_feather('data/links.ftr')
df_types.reset_index(drop=True).to_feather('data/types.ftr')
df_lemmas.reset_index(drop=True).to_feather('data/lemmas.ftr')
df_grammemes.reset_index(drop=True).to_feather('data/grammemes.ftr')

In [55]:
# Build one comma-separated key per word form:
#   word,tag_code,stem_offset,stem_length,tag_norm_code,prefix_norm_code,suffix_norm_code
# The *_code fields are category codes, i.e. positions in the dumped
# tags/prefixes/suffixes lists.
# NOTE(review): assumes no word contains a comma -- TODO confirm.
keys = df_lemmas['word'].copy()

# Fields describing the form itself.
keys += ',' + df_lemmas['tag'].cat.codes.astype(str)
keys += ',' + df_lemmas['stem_offset'].astype(str)
keys += ',' + df_lemmas['stem_length'].astype(str)

# Fields needed to rebuild the normal form: norm prefix + stem + norm suffix.
keys += ',' + df_lemmas['tag_norm'].cat.codes.astype(str)
keys += ',' + df_lemmas['prefix_norm'].cat.codes.astype(str)
keys += ',' + df_lemmas['suffix_norm'].cat.codes.astype(str)

In [60]:
# Build the DAWG over all keys and save it.
# NOTE(review): the sep/keys/replaces keywords are not the signature of the
# common `DAWG` PyPI package -- TODO confirm which `dawg` module this is.
# 'replaces' presumably lets a query spelled with 'е' also match entries
# spelled with 'ё' (the recorded 'озера' query returns 'озёра').
params = {
    'sep': ',',
    'keys': keys.tolist(),
    'replaces': {'е':'ё'}
}

dwg = dawg.DAWG(**params)
dwg.save('data/words.dawg')

In [1]:
import dawg
import pandas as pd

In [2]:
def load_lines(path, encoding='utf-8'):
    """Read a text file and return its lines without line terminators.

    Parameters
    ----------
    path : str
        File to read.
    encoding : str, optional
        Text encoding of the file. It is passed explicitly so that reading
        Cyrillic content does not depend on the platform's default encoding.
        NOTE(review): assumes the files were written as UTF-8 -- TODO confirm
        against the writer (dump_lines).

    Returns
    -------
    list of str
        One entry per line; empty lines are kept as ``''``.
    """
    with open(path, encoding=encoding) as fd:
        return fd.read().splitlines()

In [3]:
class MyMorphy:
    """Morphological lookup backed by the saved DAWG and its lookup tables.

    Every DAWG key has the layout
    ``word,tag,stem_offset,stem_length,norm_tag,norm_prefix,norm_suffix``,
    where all fields after the word are integers. The tag/prefix/suffix
    fields are positions in the ``tags`` / ``prefixes`` / ``suffixes`` lists
    loaded from text files.
    """

    # Field separator of the DAWG keys; also handed to the DAWG itself.
    SEP = ','
    # Number of integer fields that follow the word in every key.
    N_FIELDS = 6

    def __init__(self, data_dir='data'):
        """Load the DAWG and the three lookup tables from ``data_dir``.

        Parameters
        ----------
        data_dir : str, optional
            Directory containing ``words.dawg``, ``tags.txt``,
            ``prefixes.txt`` and ``suffixes.txt``. Defaults to ``'data'``.
        """
        params = {
            'sep': self.SEP,
            'replaces': {'е':'ё'},
            'path': data_dir + '/words.dawg'
        }

        self.dwg = dawg.DAWG(**params)
        self.tags = load_lines(data_dir + '/tags.txt')
        self.prefixes = load_lines(data_dir + '/prefixes.txt')
        self.suffixes = load_lines(data_dir + '/suffixes.txt')

    def parse(self, key):
        """Decode one DAWG key.

        Parameters
        ----------
        key : str
            Key in the layout described on the class.

        Returns
        -------
        tuple of str
            ``(word, tag, norm_word, norm_tag)``, where ``norm_word`` is
            rebuilt as normal-form prefix + stem of ``word`` + normal-form
            suffix.

        Raises
        ------
        IndexError
            If the key has too few fields or a code is out of range.
        ValueError
            If a numeric field is not an integer.
        """
        # Split from the right with a fixed field count, so a word that itself
        # contains the separator is still parsed correctly.
        values = key.rsplit(self.SEP, self.N_FIELDS)

        word = values[0]
        tag = self.tags[int(values[1])]
        stem_offset = int(values[2])
        stem_length = int(values[3])

        norm_tag = self.tags[int(values[4])]
        norm_prefix = self.prefixes[int(values[5])]
        norm_suffix = self.suffixes[int(values[6])]

        stem = word[stem_offset:stem_offset + stem_length]
        norm_word = norm_prefix + stem + norm_suffix

        return word, tag, norm_word, norm_tag

    def search(self, word):
        """Return the decoded entries the DAWG reports as similar to ``word``.

        Returns
        -------
        list of tuple
            One ``parse`` result per key returned by ``similar_items``;
            empty when there are none.
        """
        return [self.parse(key) for key in self.dwg.similar_items(word)]

In [4]:
morph = MyMorphy()

In [5]:
morph.search('надёжность')

[('надёжность',
  'NOUN,inan,femn @ sing,accs',
  'надёжность',
  'NOUN,inan,femn @ sing,nomn'),
 ('надёжность',
  'NOUN,inan,femn @ sing,nomn',
  'надёжность',
  'NOUN,inan,femn @ sing,nomn')]

In [6]:
morph.dwg.similar_items('надёжность')

['надёжность,2474,0,9,2490,0,11485', 'надёжность,2490,0,9,2490,0,11485']

In [7]:
cols = ['word', 'tag', 'word_norm', 'tag_norm']
pd.DataFrame(morph.search('озера'), columns=cols)

Unnamed: 0,word,tag,word_norm,tag_norm
0,озёра,"NOUN,inan,neut @ plur,accs",озеро,"NOUN,inan,neut @ sing,nomn"
1,озёра,"NOUN,inan,neut @ plur,nomn",озеро,"NOUN,inan,neut @ sing,nomn"
2,озера,"NOUN,inan,neut @ sing,gent",озеро,"NOUN,inan,neut @ sing,nomn"


In [8]:
%%time
for _ in range(140000):
    morph.search('надёжность')

CPU times: user 1.03 s, sys: 592 µs, total: 1.03 s
Wall time: 1.04 s


In [9]:
!du -h data/*

402M	data/dict.opcorpora.xml
8,0K	data/grammemes.ftr
55M	data/lemmas.ftr
1,5M	data/links.bin
2,8M	data/links.ftr
8,0K	data/prefixes.txt
172K	data/suffixes.txt
220K	data/tags.txt
4,0K	data/types.ftr
11M	data/words.dawg
