In [1]:
import dawg
import pandas as pd
from utils import load_lines

In [2]:
class Morph:
    """Dictionary-based morphological lookup backed by a DAWG of word records.

    Each record returned by the DAWG is a string of eight separator-delimited
    fields: the word followed by seven integers (tag index, stem offset, stem
    length, normal-form tag index, normal-form prefix index, normal-form
    suffix index and a count). The indices refer to the line lists loaded
    from ``tags.txt``, ``prefixes.txt`` and ``suffixes.txt``.
    """

    # Separator between the fields of a record.
    # NOTE(review): the same value is passed to the DAWG as `sep`; the sample
    # records are comma-separated, but confirm against the dawg module that
    # both really are the same separator.
    FIELD_SEP = ','

    # Number of integer fields that follow the word in every record.
    N_INT_FIELDS = 7

    def __init__(self, data_dir):
        """Load the DAWG and the lookup tables from ``data_dir``.

        Args:
            data_dir: Directory containing ``words.dawg``, ``tags.txt``,
                ``prefixes.txt`` and ``suffixes.txt``.
        """
        params = {
            'sep': self.FIELD_SEP,
            # Presumably lets a query spelled with 'е' also match entries
            # spelled with 'ё' -- confirm against the dawg module.
            'replaces': {'е':'ё'},
            'path': f'{data_dir}/words.dawg'
        }

        self.dwg = dawg.DAWG(**params)
        # Lookup tables; `parse` indexes each of them by integer position.
        self.tags = load_lines(f'{data_dir}/tags.txt')
        self.prefixes = load_lines(f'{data_dir}/prefixes.txt')
        self.suffixes = load_lines(f'{data_dir}/suffixes.txt')

    def parse(self, key):
        """Decode one DAWG record into a readable tuple.

        Args:
            key: Record string ``word,tag,offset,length,norm_tag,prefix,
                suffix,cnt`` where everything after the word is an integer.

        Returns:
            ``(word, tag, norm_word, norm_tag, cnt)`` where ``norm_word`` is
            rebuilt as ``prefix + stem + suffix`` and the stem is the slice
            ``word[offset:offset + length]``.

        Raises:
            IndexError: If the record has too few fields or an index is
                outside its lookup table.
            ValueError: If a numeric field is not a valid integer.
        """
        # Split from the right: only the last seven separators delimit
        # fields, so a separator occurring inside the word stays in the word.
        values = key.rsplit(self.FIELD_SEP, self.N_INT_FIELDS)

        word = values[0]
        tag = self.tags[int(values[1])]
        stem_offset = int(values[2])
        stem_length = int(values[3])

        norm_tag = self.tags[int(values[4])]
        norm_prefix = self.prefixes[int(values[5])]
        norm_suffix = self.suffixes[int(values[6])]
        cnt = int(values[7])

        stem = word[stem_offset:stem_offset + stem_length]
        norm_word = norm_prefix + stem + norm_suffix

        return word, tag, norm_word, norm_tag, cnt

    def search(self, word):
        """Return the parsed records the DAWG reports for ``word``.

        Args:
            word: Word form to look up.

        Returns:
            A list of ``(word, tag, norm_word, norm_tag, cnt)`` tuples, one
            per record returned by ``self.dwg.similar_items(word)``.
        """
        return [self.parse(key) for key in self.dwg.similar_items(word)]

In [3]:
# Build the analyzer from the files under the relative `data` directory
# (words.dawg, tags.txt, prefixes.txt, suffixes.txt).
morph = Morph('data')

In [4]:
# Each result is a (word, tag, normal form, normal-form tag, cnt) tuple.
morph.search('надёжность')

[('надёжность',
  'NOUN,inan,femn @ sing,accs',
  'надёжность',
  'NOUN,inan,femn @ sing,nomn',
  103),
 ('надёжность',
  'NOUN,inan,femn @ sing,nomn',
  'надёжность',
  'NOUN,inan,femn @ sing,nomn',
  103)]

In [5]:
# Raw, unparsed records for the same word: the word followed by the seven
# integer fields that Morph.parse decodes.
morph.dwg.similar_items('надёжность')

['надёжность,2474,0,9,2490,0,11485,103',
 'надёжность,2490,0,9,2490,0,11485,103']

In [10]:
# Query spelled with 'е': the output also contains the 'озёра' forms,
# presumably via the 'е'->'ё' replacement passed to the DAWG -- confirm.
cols = ['word','tag','norm','tag_norm','cnt']
pd.DataFrame(morph.search('озера'), columns=cols)

Unnamed: 0,word,tag,norm,tag_norm,cnt
0,озёра,"NOUN,inan,neut @ plur,accs",озеро,"NOUN,inan,neut @ sing,nomn",110
1,озёра,"NOUN,inan,neut @ plur,nomn",озеро,"NOUN,inan,neut @ sing,nomn",110
2,озера,"NOUN,inan,neut @ sing,gent",озеро,"NOUN,inan,neut @ sing,nomn",110


In [8]:
%%time
# Rough throughput check: repeat the same lookup 140000 times and let
# %%time report the total; the results are discarded.
for _ in range(140000):
    morph.search('надёжность')

CPU times: user 1.13 s, sys: 0 ns, total: 1.13 s
Wall time: 1.14 s


In [9]:
# On-disk size of every entry under data/, in human-readable units.
!du -h data/*

508M	data/annot.opcorpora.xml
402M	data/dict.opcorpora.xml
8,0K	data/prefixes.txt
1,9M	data/replaces.bin
172K	data/suffixes.txt
220K	data/tags.txt
28M	data/words.dawg
