In [44]:
import os

import dawg
import pandas as pd
from tqdm import tqdm

from utils import lcs
from utils import load_lines
from utils import dump_lines
from utils import load_pickle
from utils import dump_pickle

from score import get_score
from parse import parse_dict
from parse import parse_annot
from link import get_replaces

In [3]:
%time counts = parse_annot('data/annot.opcorpora.xml')

CPU times: user 48.2 s, sys: 2.3 s, total: 50.5 s
Wall time: 50.9 s


In [4]:
%time data, cols = parse_dict('data/dict.opcorpora.xml')

CPU times: user 1min 18s, sys: 1.37 s, total: 1min 19s
Wall time: 1min 20s


In [3]:
# Building the replacement map is by far the slowest step of this notebook
# (roughly half an hour in the recorded run), so reuse the pickled result of
# a previous run when it is present. Delete the file to force a rebuild,
# e.g. after the dictionary dump changes.
if os.path.exists('data/replaces.bin'):
    replaces = load_pickle('data/replaces.bin')
else:
    replaces = get_replaces(data['links'])
    dump_pickle(replaces, 'data/replaces.bin')

100%|████████████████████████████████████████████████████████████| 258650/258650 [32:51<00:00, 131.22it/s]
100%|███████████████████████████████████████████████████████████| 66195/66195 [00:00<00:00, 465020.62it/s]

CPU times: user 32min 38s, sys: 12 s, total: 32min 50s
Wall time: 32min 51s





In [21]:
# Reload the replacement map written by the `get_replaces` cell so the slow
# computation can be skipped in a fresh session.
# NOTE(review): load_pickle presumably unpickles the file; only load files
# produced by this notebook.
replaces_path = 'data/replaces.bin'
replaces = load_pickle(replaces_path)

In [5]:
# One row per parsed 'lemmas' record, labelled with the matching column names.
df = pd.DataFrame(
    data=data['lemmas'],
    columns=cols['lemmas'],
)

In [7]:
# Attach the corpus count to every row by id; ids missing from `counts`
# come back as NaN from `map` and are turned into 0 before the int cast.
df['cnt'] = (
    df['id']
    .map(counts)
    .fillna(0)
    .astype(int)
)

In [8]:
# (rows with a positive count, rows with a zero count)
df['cnt'].gt(0).sum(), df['cnt'].eq(0).sum()

(873978, 4267289)

In [9]:
# Every dictionary row whose word form is exactly 'а'.
df[df['word'].eq('а')]

Unnamed: 0,id,tag,word,form,cnt
605,51,CONJ,а,,9643
606,52,INTJ,а,,7893
607,53,PRCL,а,,7892
5089493,390180,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Name,Init",а,"sing,nomn",3642
5089494,390180,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Name,Init",а,"sing,gent",3642
5089495,390180,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Name,Init",а,"sing,datv",3642
5089496,390180,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Name,Init",а,"sing,accs",3642
5089497,390180,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Name,Init",а,"sing,ablt",3642
5089498,390180,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Name,Init",а,"sing,loct",3642
5089499,390181,"NOUN,anim,ms-f,Sgtm,Fixd,Abbr,Patr,Init",а,"sing,nomn",3642


In [19]:
# Rows spelled 'осел' (with е), to compare against the ё spelling 'осёл'.
df[df['word'].eq('осел')]

Unnamed: 0,id,tag,word,form,cnt
2734698,209868,"VERB,perf,intr",осел,"masc,sing,past,indc",5


In [10]:
# Rows spelled 'осёл' (with ё), to compare against the е spelling 'осел'.
df[df['word'].eq('осёл')]

Unnamed: 0,id,tag,word,form,cnt
2728608,209408,"NOUN,anim,masc",осёл,"sing,nomn",16


In [20]:
# Rows spelled 'небо' (with е), to compare against the ё spelling 'нёбо'.
df[df['word'].eq('небо')]

Unnamed: 0,id,tag,word,form,cnt
2285339,179438,"NOUN,inan,neut",небо,"sing,nomn",304
2285342,179438,"NOUN,inan,neut",небо,"sing,accs",304


In [11]:
# Rows spelled 'нёбо' (with ё), to compare against the е spelling 'небо'.
df[df['word'].eq('нёбо')]

Unnamed: 0,id,tag,word,form,cnt
5097726,390849,"NOUN,inan,neut",нёбо,"sing,nomn",190
5097729,390849,"NOUN,inan,neut",нёбо,"sing,accs",190


In [12]:
# Every dictionary row whose word form is exactly 'вера'.
df[df['word'].eq('вера')]

Unnamed: 0,id,tag,word,form,cnt
490950,41143,"NOUN,inan,femn",вера,"sing,nomn",181
490963,41144,"NOUN,anim,femn,Name",вера,"sing,nomn",55


In [13]:
# Every dictionary row whose word form is exactly 'любовь'.
df[df['word'].eq('любовь')]

Unnamed: 0,id,tag,word,form,cnt
1917905,152019,"NOUN,inan,femn",любовь,"sing,nomn",460
1917908,152019,"NOUN,inan,femn",любовь,"sing,accs",460
1917926,152020,"NOUN,anim,femn,Name",любовь,"sing,nomn",174
1917929,152020,"NOUN,anim,femn,Name",любовь,"sing,accs",174


In [14]:
# Rows spelled 'озера' (with е), to compare against the ё spelling 'озёра'.
df[df['word'].eq('озера')]

Unnamed: 0,id,tag,word,form,cnt
2658960,204182,"NOUN,inan,neut",озера,"sing,gent",110


In [15]:
# Rows spelled 'озёра' (with ё), to compare against the е spelling 'озера'.
df[df['word'].eq('озёра')]

Unnamed: 0,id,tag,word,form,cnt
2658965,204182,"NOUN,inan,neut",озёра,"plur,nomn",110
2658968,204182,"NOUN,inan,neut",озёра,"plur,accs",110


In [16]:
# Every dictionary row whose word form is exactly 'валерия'.
df[df['word'].eq('валерия')]

Unnamed: 0,id,tag,word,form,cnt
455226,38160,"NOUN,anim,masc,Name",валерия,"sing,gent",69
455228,38160,"NOUN,anim,masc,Name",валерия,"sing,accs",69
455419,38174,"NOUN,anim,femn,Name",валерия,"sing,nomn",31


In [17]:
# Every dictionary row whose word form is exactly 'александра'.
df[df['word'].eq('александра')]

Unnamed: 0,id,tag,word,form,cnt
102225,8527,"NOUN,anim,masc,Name",александра,"sing,gent",561
102227,8527,"NOUN,anim,masc,Name",александра,"sing,accs",561
102260,8530,"NOUN,anim,femn,Name",александра,"sing,nomn",218


In [18]:
# Largest single count in the corpus statistics.
max(occurrences for occurrences in counts.values())

467644

In [22]:
%%time
df['tag'] = df['tag'] + ' @ ' + df['form']
df['norm_score'] = df['tag'].apply(get_score)
df['id'] = df['id'].map(replaces).fillna(df['id']).astype('uint32')
df['norm_idx'] = df.groupby('id')['norm_score'].transform(lambda x: x.idxmax())

CPU times: user 50.4 s, sys: 675 ms, total: 51 s
Wall time: 51.5 s


In [23]:
def split(group):
    """Split every word form of one group into prefix + stem + suffix.

    The stem is whatever ``lcs`` returns for the group's word forms
    (presumably their longest common substring -- confirm against
    ``utils.lcs``). For each form the text before the first occurrence of
    the stem is the prefix and the text after it is the suffix.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain a ``'word'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``group`` plus the columns
        ``prefix``, ``suffix``, ``stem_offset`` (position of the stem inside
        the form) and ``stem_length`` (same value for the whole group).
        ``group`` itself is left untouched: mutating the object passed in by
        ``groupby(...).apply`` is not supported by pandas.

    Raises
    ------
    ValueError
        If the stem returned by ``lcs`` does not occur in one of the forms.
    """
    forms = group['word'].tolist()
    stem = lcs(forms)
    stem_length = len(stem)

    # The stem's position is also the prefix length, so locate it once per form.
    offsets = [form.index(stem) for form in forms]
    prefixes = [form[:offset] for form, offset in zip(forms, offsets)]
    suffixes = [
        form[offset + stem_length:]
        for form, offset in zip(forms, offsets)
    ]

    return group.assign(
        prefix=prefixes,
        suffix=suffixes,
        stem_offset=offsets,
        stem_length=stem_length,
    )

# Register `progress_apply` on pandas objects, then split every id group
# into prefix/stem/suffix and renumber the rows from 0.
tqdm.pandas()
df = (
    df
    .groupby('id', group_keys=False)
    .progress_apply(split)
    .reset_index(drop=True)
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 183259/183259 [04:01<00:00, 760.38it/s]


In [24]:
# Store the repetitive string columns as categoricals; their integer codes
# are what later goes into the DAWG keys.
for column in ('tag', 'prefix', 'suffix'):
    df[column] = df[column].astype('category')

In [25]:
# Give every row the prefix, suffix and tag of its group's normal form.
# `norm_idx` holds the index label of that row, so the frame is joined
# against itself: left rows by `norm_idx`, right rows by index. The copied
# columns get the `_norm` suffix (prefix_norm, suffix_norm, tag_norm).
# NOTE(review): `norm_idx` was computed before the groupby/reset_index step;
# this join relies on the index labels still matching -- confirm.
params = {
    'right_index': True,
    'left_on': 'norm_idx',
    'suffixes': ('', '_norm')
}
# Deliberately not named `cols`: that name holds the column mapping returned
# by `parse_dict`, and overwriting it breaks re-running the cell that builds
# `df` from `cols['lemmas']`.
norm_cols = ['prefix', 'suffix', 'tag']
df = pd.merge(df, df[norm_cols], **params)

In [27]:
def save_categories(series, path):
    """Dump the category labels of a categorical ``series`` to ``path``.

    The labels are passed to ``dump_lines`` in category order, so a label's
    position in the list equals its ``.cat.codes`` value.
    """
    labels = series.cat.categories.tolist()
    dump_lines(labels, path)

# Persist the label tables that the integer codes in the DAWG keys refer to.
category_files = [
    ('tag', 'data/tags.txt'),
    ('prefix', 'data/prefixes.txt'),
    ('suffix', 'data/suffixes.txt'),
]
for column, path in category_files:
    save_categories(df[column], path)

In [28]:
# Each key is the word form followed by comma-separated integer fields, in
# this order: tag code, stem offset, stem length, normal-form tag code,
# normal-form prefix code, normal-form suffix code, corpus count.
key_fields = [
    df['tag'].cat.codes,
    df['stem_offset'],
    df['stem_length'],
    df['tag_norm'].cat.codes,
    df['prefix_norm'].cat.codes,
    df['suffix_norm'].cat.codes,
    df['cnt'],
]

keys = df['word'].copy()
for field in key_fields:
    keys = keys + ',' + field.astype(str)

In [42]:
# Spot-check one assembled key by its index label.
keys.loc[999558]

'дельным,849,0,3,825,0,11638,12'

In [45]:
# Arguments for the DAWG build: the comma used inside each key, the keys
# themselves, and a character replacement table mapping 'е' to 'ё'.
# NOTE(review): presumably `replaces` lets lookups match ё-words typed with е;
# confirm against the dawg module.
params = dict(
    sep=',',
    keys=keys.tolist(),
    replaces={'е': 'ё'},
)

# Build the automaton and write it to disk.
dwg = dawg.DAWG(**params)
dwg.save('data/words.dawg')