In [1]:
import pandas as pd
from tqdm import tqdm

from utils import lcs
from utils import load_lines
from utils import dump_lines
from utils import load_pickle
from utils import dump_pickle

from score import get_score
from parse import parse_dict
from link import get_replaces

In [2]:
from lxml import etree
from parse import parse_lemma

In [24]:
def parse_token(elem):
    """Extract the lower-cased text and the grammeme tag of one token element.

    Only the first child of the first child of the first child of ``elem``
    is inspected; any further alternatives are ignored.

    Parameters
    ----------
    elem : element
        Element carrying a ``text`` attribute. Three levels down (always
        taking the first child) there must be an element whose children
        each hold one grammeme in their ``v`` attribute.

    Returns
    -------
    tuple of (str, str)
        ``(text, tag)`` where ``text`` is ``elem.attrib['text']`` lower-cased
        and ``tag`` is the ``v`` values joined with ``','`` in document order.

    Raises
    ------
    IndexError
        If one of the three nesting levels has no children.
    KeyError
        If the ``text`` attribute or a ``v`` attribute is missing.
    """
    # Index and iterate the elements directly: ``getchildren()`` is
    # deprecated in lxml and no longer exists in the stdlib ElementTree.
    lemma = elem[0][0][0]

    text = elem.attrib['text'].lower()
    grammemes = [g.attrib['v'] for g in lemma]

    return text, ','.join(grammemes)


def parse_annot(path):
    """Count surface forms and their grammeme tags in an annotated corpus.

    The XML file is streamed with ``etree.iterparse`` and every ``token``
    element is handed to ``parse_token``.

    Parameters
    ----------
    path : str
        Location of the corpus XML file, passed to ``etree.iterparse``.

    Returns
    -------
    dict
        ``{'words': words, 'forms': forms}`` where

        * ``words`` maps the lower-cased token text, with every ``ё``
          replaced by ``е``, to its number of occurrences;
        * ``forms`` maps the lower-cased token text (``ё`` kept as is) to a
          dict ``{tag: count}``.
    """
    words = dict()
    forms = dict()

    # Let lxml filter by tag so that only completed token elements reach us.
    for _, elem in etree.iterparse(path, tag='token'):
        text, tag = parse_token(elem)

        # Free the processed token and the siblings that precede it.
        # ``clear()`` alone leaves the emptied elements attached to their
        # parent, so memory would keep growing with the size of the corpus.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]

        tag_counts = forms.setdefault(text, dict())
        tag_counts[tag] = tag_counts.get(tag, 0) + 1

        # Word totals deliberately fold ё into е; ``forms`` does not.
        key = text.replace('ё', 'е')
        words[key] = words.get(key, 0) + 1

    return {'words': words, 'forms': forms}

# Build the word / form frequency tables from the annotated corpus.
# NOTE(review): `data` is rebound by the later `parse_dict(...)` cell, so the
# lookups on these counts only work while this assignment is the live one.
data = parse_annot('data/annot.opcorpora.xml')

In [45]:
# Total occurrences of 'валерия' in the ё-folded word counts.
data['words']['валерия']

13

In [46]:
# Per-tag counts recorded for the surface form 'валерия'.
data['forms']['валерия']

{'NOUN,anim,femn,Name,sing,nomn': 1,
 'NOUN,anim,masc,Name,sing,gent': 7,
 'NOUN,anim,masc,Name,sing,accs': 5}

In [30]:
# Total for 'озера'; this key also absorbs 'озёра' because ё is folded to е.
data['words']['озера']

31

In [27]:
# Per-tag counts for 'озера' exactly as spelled (forms are not ё-folded).
data['forms']['озера']

{'NOUN,inan,neut,sing,gent': 28, 'NOUN,inan,neut,plur,accs': 1}

In [28]:
# Per-tag counts for the ё spelling 'озёра', which is kept as a separate key.
data['forms']['озёра']

{'NOUN,inan,neut,plur,accs': 1, 'NOUN,inan,neut,plur,nomn': 1}

In [2]:
# Parse the dictionary XML; `%time` reports how long the call takes.
# NOTE(review): this rebinds `data`. The following cells read `data['links']`
# and `data['lemmas']` and index `cols['lemmas']`.
%time data, cols = parse_dict('data/dict.opcorpora.xml')

CPU times: user 1min 26s, sys: 1.73 s, total: 1min 28s
Wall time: 1min 29s


In [3]:
# Derive `replaces` from the dictionary links (it is later passed to
# `Series.map` to rewrite lemma ids) and pickle it. The recorded run of this
# cell took about 33 minutes.
replaces = get_replaces(data['links'])
dump_pickle(replaces, 'data/replaces.bin')

100%|████████████████████████████████████████████████████████████| 258650/258650 [32:51<00:00, 131.22it/s]
100%|███████████████████████████████████████████████████████████| 66195/66195 [00:00<00:00, 465020.62it/s]

CPU times: user 32min 38s, sys: 12 s, total: 32min 50s
Wall time: 32min 51s





In [9]:
# Tabulate the parsed lemma records under the column names from parse_dict.
df = pd.DataFrame(data['lemmas'], columns=cols['lemmas'])

In [17]:
%%time
# NOTE(review): not idempotent — re-running appends ' @ ' + form to `tag` again.
df['tag'] = df['tag'] + ' @ ' + df['form']
# Score every combined tag string with `get_score`.
df['norm_score'] = df['tag'].apply(get_score)
# Rewrite ids through `replaces`; ids without an entry keep their own value.
df['id'] = df['id'].map(replaces).fillna(df['id']).astype('uint32')
# For each id group, store the index label of its highest-scoring row.
df['norm_idx'] = df.groupby('id')['norm_score'].transform(lambda x: x.idxmax())

CPU times: user 50.1 s, sys: 1.58 s, total: 51.7 s
Wall time: 51.3 s


In [18]:
def split(group):
    """Cut every word of one group into prefix, stem and suffix.

    The stem is whatever ``lcs`` returns for the group's words (assumed to be
    a substring shared by all of them — confirm against ``utils.lcs``). Each
    word is cut at the first occurrence of that stem.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one ``id`` group; must contain a ``word`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows and index of ``group`` plus the columns
        ``prefix`` and ``suffix`` (text before / after the stem),
        ``stem_offset`` (position at which the stem starts) and
        ``stem_length`` (the same value for every row of the group).

    Raises
    ------
    ValueError
        If the stem does not occur in one of the words.
    """
    forms = group['word'].tolist()
    stem = lcs(forms)
    stem_length = len(stem)

    # ``str.index`` rather than ``str.partition`` so an empty stem still works.
    offsets = [form.index(stem) for form in forms]
    prefixes = [form[:start] for form, start in zip(forms, offsets)]
    suffixes = [
        form[start + stem_length:]
        for form, start in zip(forms, offsets)
    ]

    # Return a new frame instead of writing into ``group``: pandas does not
    # support mutating the object handed to a groupby ``apply`` callback.
    return group.assign(
        prefix=prefixes,
        suffix=suffixes,
        stem_offset=offsets,
        stem_length=stem_length,
    )

# Register `progress_apply` on pandas objects so the per-group pass shows a bar.
tqdm.pandas()
# Run `split` once per `id` group; group_keys=False keeps `id` out of the index.
df = df.groupby('id', group_keys=False).progress_apply(split)
# NOTE(review): this renumbers the rows 0..n-1, while `norm_idx` holds index
# labels computed earlier and is later matched against the index — confirm
# that the row order is unchanged here so the labels still line up.
df = df.reset_index(drop=True)

100%|████████████████████████████████████████████████████████████| 183259/183259 [03:59<00:00, 765.23it/s]


In [19]:
# Store the three repetitive string columns as categoricals in one pass.
df = df.astype({
    'tag': 'category',
    'prefix': 'category',
    'suffix': 'category',
})

In [20]:
# Attach to every row the prefix, suffix and tag of the row that `norm_idx`
# points at; the copied columns get the `_norm` suffix.
# A dedicated name is used for the column list so that `cols` (the mapping
# returned by parse_dict and indexed as `cols['lemmas']`) is not clobbered.
# NOTE(review): like the original, this cell is meant to run once per fresh
# `df`; a second run would merge onto the already-merged frame.
norm_cols = ['prefix', 'suffix', 'tag']
df = df.merge(
    df[norm_cols],
    left_on='norm_idx',
    right_index=True,
    suffixes=('', '_norm'),
)

In [23]:
def save_categories(series, path):
    """Write the categories of a categorical Series to ``path``.

    The categories are converted to a plain list and handed to
    ``dump_lines`` together with ``path``.
    """
    categories = series.cat.categories
    dump_lines(categories.tolist(), path)

# Write the category lists of the tag, prefix and suffix columns to text
# files. The keys built from `.cat.codes` are presumably resolved against
# these lists by position — confirm against the consumer of the files.
save_categories(df['tag'], 'data/tags.txt')
save_categories(df['prefix'], 'data/prefixes.txt')
save_categories(df['suffix'], 'data/suffixes.txt')

In [22]:
# Start from the word itself; every further field is appended after a comma.
keys = df['word'].copy()

# The row's own tag code and where its stem sits inside the word.
keys += ',' + df['tag'].cat.codes.astype(str)
keys += ',' + df['stem_offset'].astype(str)
keys += ',' + df['stem_length'].astype(str)

# Category codes of the tag, prefix and suffix taken from the `norm_idx` row.
keys += ',' + df['tag_norm'].cat.codes.astype(str)
keys += ',' + df['prefix_norm'].cat.codes.astype(str)
keys += ',' + df['suffix_norm'].cat.codes.astype(str)

In [24]:
# Build the DAWG from the comma-joined keys and write it to disk.
# `dawg` is not imported by any earlier cell, so on a fresh kernel this cell
# would stop with a NameError; import it where it is used.
import dawg

# NOTE(review): 'sep' presumably names the separator used inside each key and
# 'replaces' a character substitution applied on lookup — confirm against
# the dawg module, which is not visible from this notebook.
params = {
    'sep': ',',
    'keys': keys.tolist(),
    'replaces': {'е':'ё'}
}

dwg = dawg.DAWG(**params)
dwg.save('data/words.dawg')