In [1]:
import dawg
import pandas as pd

In [2]:
from lemma.score import get_score
from lemma.utils import load_json
from lemma.utils import dump_lines
from lemma.parse import parse_dict
from lemma.merge import merge_lemmas

In [3]:
from lemma.split import get_stem
from lemma.split import get_prefixes
from lemma.split import get_suffixes

In [4]:
# Parse the dictionary XML (timed). The next cell shows that both returned
# mappings share the keys 'types', 'links', 'lemmas' and 'grammemes'.
%time data, cols = parse_dict('data/dict.opcorpora.xml')

CPU times: user 56.9 s, sys: 1.58 s, total: 58.5 s
Wall time: 58.5 s


In [5]:
# Sanity check: `cols` and `data` are expected to expose the same table names.
cols.keys(), data.keys()

(dict_keys(['types', 'links', 'lemmas', 'grammemes']),
 dict_keys(['types', 'links', 'lemmas', 'grammemes']))

In [6]:
# One DataFrame per parsed table, using the column names that parse_dict
# returned for that table.
dfs = {}
for table_name, rows in data.items():
    dfs[table_name] = pd.DataFrame(rows, columns=cols[table_name])

In [7]:
# Preview the 'types' table: a numeric id plus a label such as ADJF-ADJS.
dfs['types'].head()

Unnamed: 0,id,type
0,1,ADJF-ADJS
1,2,ADJF-COMP
2,3,INFN-VERB
3,4,INFN-PRTF
4,5,INFN-GRND


In [8]:
# Preview the 'links' table: id, from, to and a numeric type per row
# (presumably lemma ids and an id from the 'types' table — confirm).
dfs['links'].head()

Unnamed: 0,id,from,to,type
0,1,5,6,1
1,2,5,7,2
2,3,9,8,3
3,4,9,10,4
4,5,9,12,5


In [9]:
# Preview the 'lemmas' table: one row per word form; in the recorded output
# all forms of the same lemma share one id.
dfs['lemmas'].head()

Unnamed: 0,id,tag,word,form
0,1,"NOUN,anim,masc",ёж,"sing,nomn"
1,1,"NOUN,anim,masc",ежа,"sing,gent"
2,1,"NOUN,anim,masc",ежу,"sing,datv"
3,1,"NOUN,anim,masc",ежа,"sing,accs"
4,1,"NOUN,anim,masc",ежом,"sing,ablt"


In [10]:
# Preview the 'grammemes' table: name, alias, parent and description.
dfs['grammemes'].head()

Unnamed: 0,name,alias,parent,description
0,POST,ЧР,,часть речи
1,NOUN,СУЩ,POST,имя существительное
2,ADJF,ПРИЛ,POST,имя прилагательное (полное)
3,ADJS,КР_ПРИЛ,POST,имя прилагательное (краткое)
4,COMP,КОМП,POST,компаратив


In [11]:
# Work on a copy: later cells assign to df['id'] and df['tag'] in place, and
# without the copy those edits would silently rewrite dfs['lemmas'] (already
# displayed above) and make this cell non-idempotent on re-run.
df = dfs['lemmas'].copy()

In [12]:
# Spelling with 'е': all matching forms belong to a single lemma id.
df.loc[df['word'].eq('небо')]

Unnamed: 0,id,tag,word,form
2285339,179438,"NOUN,inan,neut",небо,"sing,nomn"
2285342,179438,"NOUN,inan,neut",небо,"sing,accs"


In [13]:
# Spelling with 'ё': in the recorded output this is a different lemma id from
# the 'е' spelling looked up in the previous cell.
df.loc[df['word'].eq('нёбо')]

Unnamed: 0,id,tag,word,form
5097726,390849,"NOUN,inan,neut",нёбо,"sing,nomn"
5097729,390849,"NOUN,inan,neut",нёбо,"sing,accs"


In [14]:
# 'е' spelling: matches only the singular genitive form of its lemma.
df.loc[df['word'].eq('озера')]

Unnamed: 0,id,tag,word,form
2658960,204182,"NOUN,inan,neut",озера,"sing,gent"


In [15]:
# 'ё' spelling: matches the plural forms, which carry the same lemma id as
# the 'е'-spelled singular genitive from the previous cell.
df.loc[df['word'].eq('озёра')]

Unnamed: 0,id,tag,word,form
2658965,204182,"NOUN,inan,neut",озёра,"plur,nomn"
2658968,204182,"NOUN,inan,neut",озёра,"plur,accs"


In [16]:
# Homonymy example: the same spelling occurs under two lemma ids, one of
# them tagged as a Name.
df.loc[df['word'].eq('любовь')]

Unnamed: 0,id,tag,word,form
1917905,152019,"NOUN,inan,femn",любовь,"sing,nomn"
1917908,152019,"NOUN,inan,femn",любовь,"sing,accs"
1917926,152020,"NOUN,anim,femn,Name",любовь,"sing,nomn"
1917929,152020,"NOUN,anim,femn,Name",любовь,"sing,accs"


In [17]:
# One spelling shared by forms of two different Name lemmas (masc and femn).
df.loc[df['word'].eq('александра')]

Unnamed: 0,id,tag,word,form
102225,8527,"NOUN,anim,masc,Name",александра,"sing,gent"
102227,8527,"NOUN,anim,masc,Name",александра,"sing,accs"
102260,8530,"NOUN,anim,femn,Name",александра,"sing,nomn"


In [18]:
# Build the mapping that the next cell applies to df['id'].
# NOTE(review): {7,21,23,27} are undocumented magic numbers — presumably ids of
# link types (cf. the 'type' column of data['links']); confirm how merge_lemmas
# uses them and consider giving the set a descriptive name.
mapping = merge_lemmas(data['links'], {7,21,23,27})

100%|██████████████████████████████████| 258650/258650 [20:21<00:00, 211.74it/s]


In [19]:
# Translate lemma ids through `mapping`; ids that come back as NaN from the
# lookup keep their original value. The result is stored as uint32.
remapped_ids = df['id'].map(mapping)
df['id'] = remapped_ids.fillna(df['id']).astype('uint32')

In [20]:
# Fold the per-form grammemes into the tag column as '<tag> @ <form>'.
df['tag'] = df['tag'].add(' @ ').add(df['form'])

In [21]:
# 'form' is now part of 'tag', so the separate column is no longer needed.
df = df.drop('form', axis='columns')

In [47]:
# Append the custom lemmas from the JSON file, giving each one a fresh id
# that continues after the largest id already present in the frame.
max_id = df['id'].max()
extras = load_json('data/dict.custom.json')

# Build every extra frame first and concatenate once: calling pd.concat inside
# the loop would re-copy the multi-million-row frame on each iteration.
extra_frames = [
    pd.DataFrame(lemma, columns=['word', 'tag']).assign(id=max_id + idx + 1)
    for idx, lemma in enumerate(extras)
]

# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
df = pd.concat([df, *extra_frames], ignore_index=True)

In [50]:
# Score every combined tag string with get_score (called once per row).
df['score'] = df['tag'].map(get_score)

In [55]:
# Inspect the frame after scoring (pandas truncates the repr to the first and
# last rows).
df

Unnamed: 0,id,tag,word,score
0,1,"NOUN,anim,masc @ sing,nomn",ёж,16
1,1,"NOUN,anim,masc @ sing,gent",ежа,14
2,1,"NOUN,anim,masc @ sing,datv",ежу,14
3,1,"NOUN,anim,masc @ sing,accs",ежа,14
4,1,"NOUN,anim,masc @ sing,ablt",ежом,14
...,...,...,...,...
5141299,395250,"NOUN,inan,femn,Infr @ plur,gent",винд,10
5141300,395250,"NOUN,inan,femn,Infr @ plur,datv",виндам,10
5141301,395250,"NOUN,inan,femn,Infr @ plur,accs",винды,10
5141302,395250,"NOUN,inan,femn,Infr @ plur,ablt",виндами,10


In [56]:
def split_words(group):
    """Split every word form of one lemma into prefix, stem and suffix parts.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of a single lemma id; must contain a 'word' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus 'prefix', 'suffix',
        'stem_offset' (the length of each prefix) and 'stem_length'
        (the length of the stem returned by get_stem for the whole group).
    """
    words = group['word'].tolist()

    stem = get_stem(words)
    prefixes = get_prefixes(words, stem)
    suffixes = get_suffixes(words, stem)

    # Return a new frame rather than assigning columns into `group`: pandas
    # advises against mutating the object handed to a groupby UDF.
    return group.assign(
        prefix=prefixes,
        suffix=suffixes,
        stem_offset=[len(prefix) for prefix in prefixes],
        stem_length=len(stem),
    )

df = df.groupby('id').apply(split_words, include_groups=False)
# apply() puts 'id' in an outer index level above the original row labels;
# drop the inner level and turn 'id' back into a column with a fresh 0..n-1 index.
df = df.droplevel(1).reset_index(drop=False)

In [57]:
# Inspect the frame with the new prefix / suffix / stem_offset / stem_length
# columns.
df

Unnamed: 0,id,tag,word,score,prefix,suffix,stem_offset,stem_length
0,1,"NOUN,anim,masc @ sing,nomn",ёж,16,ё,,1,1
1,1,"NOUN,anim,masc @ sing,gent",ежа,14,е,а,1,1
2,1,"NOUN,anim,masc @ sing,datv",ежу,14,е,у,1,1
3,1,"NOUN,anim,masc @ sing,accs",ежа,14,е,а,1,1
4,1,"NOUN,anim,masc @ sing,ablt",ежом,14,е,ом,1,1
...,...,...,...,...,...,...,...,...
5141299,395250,"NOUN,inan,femn,Infr @ plur,gent",винд,10,,,0,4
5141300,395250,"NOUN,inan,femn,Infr @ plur,datv",виндам,10,,ам,0,4
5141301,395250,"NOUN,inan,femn,Infr @ plur,accs",винды,10,,ы,0,4
5141302,395250,"NOUN,inan,femn,Infr @ plur,ablt",виндами,10,,ами,0,4


In [58]:
# Convert the three string columns to categoricals; their categories are
# saved to disk and their codes are embedded in the keys further down.
for column in ('tag', 'prefix', 'suffix'):
    df[column] = df[column].astype('category')

In [59]:
# For every row, the index label of the highest-scoring row within the same
# lemma id (the first such row on ties). The built-in groupby idxmax avoids
# running a Python lambda once per group; mapping it back through 'id'
# broadcasts the per-lemma result to all of that lemma's rows.
best_row_by_id = df.groupby('id')['score'].idxmax()
df['norm_idx'] = df['id'].map(best_row_by_id)

In [60]:
# Attach to every row the prefix / suffix / tag of the row referenced by
# 'norm_idx', as 'prefix_norm' / 'suffix_norm' / 'tag_norm'.
# A dedicated name is used for the column list so that `cols` (the column
# schema returned by parse_dict and needed to rebuild `dfs`) is not clobbered.
norm_cols = ['prefix', 'suffix', 'tag']

df = df.merge(
    df[norm_cols],
    left_on='norm_idx',
    right_index=True,
    suffixes=('', '_norm'),
)

# The helper column has served its purpose once the merge is done.
df = df.drop(columns='norm_idx')

In [61]:
# Inspect the frame with the *_norm columns added by the merge.
df

Unnamed: 0,id,tag,word,score,prefix,suffix,stem_offset,stem_length,prefix_norm,suffix_norm,tag_norm
0,1,"NOUN,anim,masc @ sing,nomn",ёж,16,ё,,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
1,1,"NOUN,anim,masc @ sing,gent",ежа,14,е,а,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
2,1,"NOUN,anim,masc @ sing,datv",ежу,14,е,у,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
3,1,"NOUN,anim,masc @ sing,accs",ежа,14,е,а,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
4,1,"NOUN,anim,masc @ sing,ablt",ежом,14,е,ом,1,1,ё,,"NOUN,anim,masc @ sing,nomn"
...,...,...,...,...,...,...,...,...,...,...,...
5141299,395250,"NOUN,inan,femn,Infr @ plur,gent",винд,10,,,0,4,,а,"NOUN,inan,femn,Infr @ sing,nomn"
5141300,395250,"NOUN,inan,femn,Infr @ plur,datv",виндам,10,,ам,0,4,,а,"NOUN,inan,femn,Infr @ sing,nomn"
5141301,395250,"NOUN,inan,femn,Infr @ plur,accs",винды,10,,ы,0,4,,а,"NOUN,inan,femn,Infr @ sing,nomn"
5141302,395250,"NOUN,inan,femn,Infr @ plur,ablt",виндами,10,,ами,0,4,,а,"NOUN,inan,femn,Infr @ sing,nomn"


In [62]:
def save_categories(series, path):
    """Pass the category labels of a categorical `series` to dump_lines for `path`.

    The labels are written in category order, i.e. in the order that the
    integer codes of the series refer to.
    """
    labels = series.cat.categories
    dump_lines(labels.tolist(), path)

# One output file per categorical column.
for column, target in (
    ('tag', 'data/tags.txt'),
    ('prefix', 'data/prefixes.txt'),
    ('suffix', 'data/suffixes.txt'),
):
    save_categories(df[column], target)

In [63]:
# Each key is the word followed by six comma-separated integers:
# tag code, stem offset, stem length, then the category codes of the
# tag / prefix / suffix taken from the row referenced as '_norm'.
keys = df['word'].copy()

numeric_fields = (
    df['tag'].cat.codes,
    df['stem_offset'],
    df['stem_length'],
    df['tag_norm'].cat.codes,
    df['prefix_norm'].cat.codes,
    df['suffix_norm'].cat.codes,
)

for field in numeric_fields:
    keys += ',' + field.astype(str)

In [64]:
# Build the DAWG from the comma-separated keys and write it to disk.
# NOTE(review): `replaces` maps 'е' to 'ё' — presumably so that lookups can
# match words typed with 'е' in place of 'ё'; confirm against the dawg module.
dwg = dawg.DAWG(
    sep=',',
    keys=keys.tolist(),
    replaces={'е':'ё'},
)
dwg.save('data/words.dawg')