In [None]:
import pandas as pd
from tqdm import tqdm

from utils import lcs
from utils import load_lines
from utils import dump_lines
from utils import load_pickle
from utils import dump_pickle

from score import get_score
from parse import parse_dict
from link import get_replaces

In [2]:
# Parse the dictionary XML once (timed). Both results are looked up by table
# name in later cells: data['links'], data['lemmas'] and cols['lemmas'].
%time data, cols = parse_dict('data/dict.opcorpora.xml')

CPU times: user 1min 26s, sys: 1.73 s, total: 1min 28s
Wall time: 1min 29s


In [3]:
# Derive the id -> id replacement mapping from the parsed links table and
# persist it next to the other artefacts. This is the slowest cell of the
# notebook (see the timing in its output).
links = data['links']
replaces = get_replaces(links)
dump_pickle(replaces, 'data/replaces.bin')

100%|████████████████████████████████████████████████████████████| 258650/258650 [32:51<00:00, 131.22it/s]
100%|███████████████████████████████████████████████████████████| 66195/66195 [00:00<00:00, 465020.62it/s]

CPU times: user 32min 38s, sys: 12 s, total: 32min 50s
Wall time: 32min 51s





In [9]:
# One row per entry of the parsed 'lemmas' table, with the column names that
# parse_dict reported for that table.
lemma_rows = data['lemmas']
lemma_columns = cols['lemmas']
df = pd.DataFrame(lemma_rows, columns=lemma_columns)

In [17]:
%%time
# Fold the form description into the tag so each row carries one combined
# "<tag> @ <form>" label, then score that label.
form_part = ' @ ' + df['form']
df['tag'] = df['tag'] + form_part
df['norm_score'] = df['tag'].apply(get_score)

# Redirect ids through the replacement mapping; ids without an entry keep
# their own value.
redirected_ids = df['id'].map(replaces)
df['id'] = redirected_ids.fillna(df['id']).astype('uint32')

# For every id group, remember the index label of its best-scoring row.
scores_by_id = df.groupby('id')['norm_score']
df['norm_idx'] = scores_by_id.transform(lambda scores: scores.idxmax())

CPU times: user 50.1 s, sys: 1.58 s, total: 51.7 s
Wall time: 51.3 s


In [18]:
def split(group):
    """Decompose every word form of one group into prefix + stem + suffix.

    The stem is whatever ``lcs`` returns for the group's word forms
    (presumably their longest common substring -- confirm in ``utils``); each
    form is cut at the first occurrence of that stem.

    Args:
        group: DataFrame with the rows of a single ``id`` group. Only its
            ``word`` column is read.

    Returns:
        A new DataFrame with the same rows and index as ``group`` plus the
        columns ``prefix``, ``suffix``, ``stem_offset`` and ``stem_length``.
        The input frame is left untouched: functions passed to
        ``GroupBy.apply`` must not mutate the object they receive.

    Raises:
        ValueError: if the stem does not occur in one of the forms
            (raised by ``str.index``).
    """
    forms = group['word'].tolist()
    stem = lcs(forms)
    stem_length = len(stem)

    # Offset of the first occurrence of the stem inside each form; this is
    # also the length of the prefix.
    offsets = [form.index(stem) for form in forms]
    prefixes = [form[:offset] for form, offset in zip(forms, offsets)]
    suffixes = [
        form[offset + stem_length:]
        for form, offset in zip(forms, offsets)
    ]

    # assign() builds a fresh frame instead of writing columns into `group`.
    return group.assign(
        prefix=prefixes,
        suffix=suffixes,
        stem_offset=offsets,
        stem_length=stem_length,
    )

# Register DataFrame/GroupBy.progress_apply so the per-group pass shows a bar.
tqdm.pandas()
df = df.groupby('id', group_keys=False).progress_apply(split)

# `norm_idx` was filled with index labels before this cell and is joined
# against the index later on, so the labels must survive. Depending on the
# pandas version, apply may hand the rows back in group order; renumbering
# them (reset_index(drop=True)) would then make `norm_idx` point at the wrong
# rows. Sorting by the existing labels restores the original order and keeps
# the labels intact.
df = df.sort_index()

100%|████████████████████████████████████████████████████████████| 183259/183259 [03:59<00:00, 765.23it/s]


In [19]:
# These three string columns are stored as categoricals; their integer
# category codes are what gets written into the keys further down.
for column in ('tag', 'prefix', 'suffix'):
    df[column] = df[column].astype('category')

In [20]:
# Attach to every row the prefix / suffix / tag of the row that `norm_idx`
# points at. The looked-up columns get the '_norm' suffix; the row's own
# columns keep their names.
#
# A dedicated name is used for the column list so that `cols` (the mapping
# returned by parse_dict and read as cols['lemmas'] when `df` is built) is not
# overwritten -- otherwise re-running the cell that builds `df` would fail.
norm_cols = ['prefix', 'suffix', 'tag']
df = pd.merge(
    df,
    df[norm_cols],
    left_on='norm_idx',
    right_index=True,
    suffixes=('', '_norm'),
)

In [23]:
def save_categories(series, path):
    """Write the categories of a categorical Series to ``path``.

    Args:
        series: Series with a categorical dtype (accessed through ``.cat``).
        path: destination passed straight to ``dump_lines``.
    """
    categories = series.cat.categories
    dump_lines(categories.tolist(), path)

# Dump the category lists whose positions the integer codes in the keys
# refer to.
category_files = (
    ('tag', 'data/tags.txt'),
    ('prefix', 'data/prefixes.txt'),
    ('suffix', 'data/suffixes.txt'),
)
for column, path in category_files:
    save_categories(df[column], path)

In [22]:
# Every key is one comma-separated record:
#   word, tag code, stem offset, stem length,
#   normal-form tag code, normal-form prefix code, normal-form suffix code
key_fields = [
    df['tag'].cat.codes,
    df['stem_offset'],
    df['stem_length'],
    df['tag_norm'].cat.codes,
    df['prefix_norm'].cat.codes,
    df['suffix_norm'].cat.codes,
]

keys = df['word'].copy()
for field in key_fields:
    keys = keys + (',' + field.astype(str))

In [24]:
# `dawg` is used here but was never imported by the notebook, so on a fresh
# kernel this cell raised NameError.
# NOTE(review): assumes `dawg` is importable as a top-level module that
# provides `DAWG` -- confirm, and consider moving this into the imports cell.
import dawg

# Arguments for the DAWG constructor: the comma-separated records built
# above, their separator, and a character replacement table.
params = {
    'sep': ',',
    'keys': keys.tolist(),
    'replaces': {'е':'ё'}
}

dwg = dawg.DAWG(**params)
dwg.save('data/words.dawg')