In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# -*- coding: utf-8 -*-
"""
:mod:`pymorphy2.opencorpora_dict.parse` is a
module for OpenCorpora XML dictionaries parsing.
"""
from __future__ import absolute_import, unicode_literals, division

import logging
import collections

try:
    from lxml.etree import iterparse

    def xml_clear_elem(elem):
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

except ImportError:
    try:
        from xml.etree.cElementTree import iterparse
    except ImportError:
        from xml.etree.ElementTree import iterparse

    def xml_clear_elem(elem):
        elem.clear()


logger = logging.getLogger(__name__)

ParsedDictionary = collections.namedtuple('ParsedDictionary', 'lexemes links grammemes version revision')


def get_dictionary_info(filename, elem_limit=1000):
    """ Return dictionary version and revision """
    for idx, (ev, elem) in enumerate(iterparse(filename, events=(str('start'),))):
        if elem.tag == 'dictionary':
            version = elem.get('version')
            revision = elem.get('revision')
            return version, revision
        if idx > elem_limit:
            return None, None
    return None, None


def parse_opencorpora_xml(filename):
    """
    Parse OpenCorpora dict XML and return a ``ParsedDictionary`` namedtuple.
    """

    links = []
    lexemes = {}
    grammemes = []

    version, revision = get_dictionary_info(filename)
    logger.info("dictionary v%s, rev%s", version, revision)
    interesting_tags = set(['grammeme', 'lemma', 'link'])

    def _parse(filename):
        for ev, elem in iterparse(filename):
            if elem.tag not in interesting_tags:
                continue
            yield ev, elem

    logger.info("parsing XML dictionary")

    for ev, elem in _parse(filename):
#         print(ev)
#         print(elem)
        if elem.tag == 'grammeme':
            name = elem.find('name').text
            parent = elem.get('parent')
            alias = elem.find('alias').text
            description = elem.find('description').text

            grammeme = (name, parent, alias, description)
            grammemes.append(grammeme)
            xml_clear_elem(elem)
        if elem.tag == 'lemma':
            lex_id, word_forms = _word_forms_from_xml_elem(elem)
            lexemes[lex_id] = word_forms
            xml_clear_elem(elem)
            
        elif elem.tag == 'link':
            break
            link_tuple = (
                elem.get('from'),
                elem.get('to'),
                elem.get('type'),
            )
            links.append(link_tuple)
            xml_clear_elem(elem)
        break


    return ParsedDictionary(
        lexemes=lexemes,
        links=links,
        grammemes=grammemes,
        version=version,
        revision=revision
    )


def _grammemes_from_elem(elem):
    return ",".join([g.get('v') for g in elem.iter('g')])


def _word_forms_from_xml_elem(elem):
    """
    Return a list of (word, tag) pairs given "lemma" XML element.
    """
    lexeme = []
    lex_id = elem.get('id')

    if len(elem) == 0:  # deleted lexeme?
        return lex_id, lexeme

    base_info = list(elem.iter('l'))

    assert len(base_info) == 1
    base_grammemes = _grammemes_from_elem(base_info[0])

    for form_elem in elem.iter('f'):
        grammemes = _grammemes_from_elem(form_elem)
        form = form_elem.get('t').lower()
        if not (base_grammemes + grammemes):
            logger.warning("no information provided for word %s, dropping the whole lexeme" % form)
            return lex_id, []
        if isinstance(form, bytes):
            form = form.decode('ascii')
        lexeme.append(
            (form, (base_grammemes + " " + grammemes).strip())
        )

    return lex_id, lexeme

In [3]:
parsed_dict = parse_opencorpora_xml("dict.opcorpora.xml")
parsed_dict

ParsedDictionary(lexemes={}, links=[], grammemes=[('POST', '', 'ЧР', 'часть речи')], version='0.92', revision='406585')

In [4]:
def with_progress(iterable, desc=None, total=None, leave=True):
    """
    Return an iterator which prints the iteration progress using tqdm package.
    Return iterable intact if tqdm is not available.
    """
    try:
        from tqdm import tqdm

        # workarounds for tqdm bugs
        def _it(iterable, desc, total, leave):
            if total is None:
                try:
                    total = len(iterable)
                except Exception:
                    total = 0
            for el in tqdm(iterable, desc=desc, total=total, leave=leave):
                yield el
            if leave:
                print("")

        return _it(iterable, desc, total, leave)

    except ImportError:
        return iterable
    
#     for ev, elem in with_progress(_parse(filename), "XML parsing"):
#         print("KEK")
#         if elem.tag == 'grammeme':
#             name = elem.find('name').text
#             parent = elem.get('parent')
#             alias = elem.find('alias').text
#             description = elem.find('description').text

#             grammeme = (name, parent, alias, description)
#             grammemes.append(grammeme)
#             xml_clear_elem(elem)

#         if elem.tag == 'lemma':
#             lex_id, word_forms = _word_forms_from_xml_elem(elem)
#             lexemes[lex_id] = word_forms
#             xml_clear_elem(elem)

#         elif elem.tag == 'link':
#             link_tuple = (
#                 elem.get('from'),
#                 elem.get('to'),
#                 elem.get('type'),
#             )
#             links.append(link_tuple)
#             xml_clear_elem(elem)


In [5]:
get_dictionary_info("dict.opcorpora.xml")

('0.92', '406585')

In [6]:
import opencorpora
corpus = opencorpora.load("dict.opcorpora.xml")
corpus

<Element dictionary at 0x11131f748>

In [7]:
# i = 0
# for elem in corpus.getiterator():
# #     print(elem)
#     i += 1
#     if elem.tag == 'link':
        
#         print(link_tuple)
# #     if i == 100000:
# #         break

In [8]:
# i = 0
# link_types = None
# for elem in corpus.getiterator():
#     i += 1
#     if elem.tag == 'link_types':
#         print(elem)
#         link_types = elem
# #     if i == 100000:
# #         break

In [9]:
# i = 0
# for val in link_types.getiterator():
#     print(val.attrib)
#     print(val.text)
#     i += 1
#     if i == 3:
#         break

In [None]:
d = {'lex_id': [], 'word_norm': [], 'word_form': [], 'morph_features_form':[], 'morph_features_norm': []}

In [1]:
i = 0
for elem in corpus.getiterator():
#     print(elem)
    i += 1
    if elem.tag == 'lemma':
        lex_id, word_forms = _word_forms_from_xml_elem(elem)
#         print(lex_id)
#         print(word_forms)
        l = len(word_forms)
        if l < 2:
            if len(word_forms[0]) < 2:
                print(word_forms)
            d['lex_id'].append(lex_id)
            d['word_norm'].append(word_forms[0][0])
            d['word_form'].append(word_forms[0][0])
            d['morph_features_form'].append(word_forms[0][1])
            d['morph_features_norm'].append(word_forms[0][1])
            
        for i in range(1, l):
            if len(word_forms[0]) < 2:
                print(word_forms)
            d['lex_id'].append(lex_id)
            d['word_norm'].append(word_forms[0][0])
            d['word_form'].append(word_forms[i][0])
            d['morph_features_form'].append(word_forms[i][1])
            d['morph_features_norm'].append(word_forms[0][1])
        if len(d['lex_id']) != len(d['morph_features_form']):
            print(len(d['lex_id']))
            print(len(d['morph_features_form']))
            print(word_forms)
            break
#     if elem.tag == 'grammeme':
#         name = elem.find('name').text
#         parent = elem.get('parent')
#         alias = elem.find('alias').text
#         description = elem.find('description').text
#         print(name)
#         print(parent)
#         print(alias)
#         print(description)
    if i == 10000:
        break

NameError: name 'corpus' is not defined

In [None]:
print(len(d["lex_id"]))
print(len(d["word_norm"]))
print(len(d["word_form"]))
print(len(d["morph_features_form"]))
print(len(d["morph_features_norm"]))


In [None]:
data_pairs = pd.DataFrame(data=d)

In [None]:
data_pairs.head()

In [None]:
data_pairs.to_csv("base_pairs_1.csv")

In [3]:
data_pairs = pd.read_csv("base_pairs.csv", index_col=0)

  mask |= (ar1 == a)


In [5]:
data_pairs.head()

Unnamed: 0,lex_id,word_norm,word_form,morph_features_form,morph_features_norm
0,1,ёж,ежа,"NOUN,anim,masc sing,gent","NOUN,anim,masc sing,nomn"
1,1,ёж,ежу,"NOUN,anim,masc sing,datv","NOUN,anim,masc sing,nomn"
2,1,ёж,ежа,"NOUN,anim,masc sing,accs","NOUN,anim,masc sing,nomn"
3,1,ёж,ежом,"NOUN,anim,masc sing,ablt","NOUN,anim,masc sing,nomn"
4,1,ёж,еже,"NOUN,anim,masc sing,loct","NOUN,anim,masc sing,nomn"


In [6]:
data_pairs.morph_features_form.unique()

array(['NOUN,anim,masc sing,gent', 'NOUN,anim,masc sing,datv',
       'NOUN,anim,masc sing,accs', ...,
       'NOUN,inan,masc,Sgtm,Infr sing,accs',
       'NOUN,inan,masc,Sgtm,Infr sing,ablt',
       'NOUN,inan,masc,Sgtm,Infr sing,loct'], dtype=object)

In [7]:
df_morph = data_pairs[["morph_features_form"]]

In [8]:
df_morph["count"] = [1] * df_morph.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
df_morph.groupby("morph_features_form").sum().sort_values("count")

Unnamed: 0_level_0,count
morph_features_form,Unnamed: 1_level_1
"NOUN,inan,neut,Fixd,Abbr,Sgtm,Orgn sing,accs",1
"NOUN,inan,femn,Sgtm,Arch sing,loct",1
"NOUN,inan,femn,Sgtm,Erro sing,ablt",1
"NOUN,inan,femn,Sgtm,Erro sing,ablt,V-ey",1
"NOUN,inan,femn,Sgtm,Erro sing,accs",1
"NOUN,inan,femn,Sgtm,Erro sing,datv",1
"NOUN,inan,femn,Sgtm,Erro sing,gent",1
"COMP,Qual,Arch V-ej",1
"COMP,Qual,Arch Cmp2,V-ej",1
"COMP,Qual,Arch Cmp2",1


In [10]:
res_morph = []
for elem in df_morph['morph_features_form'].values:
    res_morph.append(elem.split(","))

In [11]:
res_morph

[['NOUN', 'anim', 'masc sing', 'gent'],
 ['NOUN', 'anim', 'masc sing', 'datv'],
 ['NOUN', 'anim', 'masc sing', 'accs'],
 ['NOUN', 'anim', 'masc sing', 'ablt'],
 ['NOUN', 'anim', 'masc sing', 'loct'],
 ['NOUN', 'anim', 'masc plur', 'nomn'],
 ['NOUN', 'anim', 'masc plur', 'gent'],
 ['NOUN', 'anim', 'masc plur', 'datv'],
 ['NOUN', 'anim', 'masc plur', 'accs'],
 ['NOUN', 'anim', 'masc plur', 'ablt'],
 ['NOUN', 'anim', 'masc plur', 'loct'],
 ['NOUN', 'inan', 'masc sing', 'gent'],
 ['NOUN', 'inan', 'masc sing', 'datv'],
 ['NOUN', 'inan', 'masc sing', 'accs'],
 ['NOUN', 'inan', 'masc sing', 'ablt'],
 ['NOUN', 'inan', 'masc sing', 'loct'],
 ['NOUN', 'inan', 'masc plur', 'nomn'],
 ['NOUN', 'inan', 'masc plur', 'gent'],
 ['NOUN', 'inan', 'masc plur', 'datv'],
 ['NOUN', 'inan', 'masc plur', 'accs'],
 ['NOUN', 'inan', 'masc plur', 'ablt'],
 ['NOUN', 'inan', 'masc plur', 'loct'],
 ['NOUN', 'anim', 'masc sing', 'gent'],
 ['NOUN', 'anim', 'masc sing', 'datv'],
 ['NOUN', 'anim', 'masc sing', 'accs'],


In [12]:
lenght = []
for i in range(len(res_morph)):
    lenght.append(len(res_morph[i]))

In [13]:
np.unique(np.array(lenght), return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([  28215,  218632,  714139,  929036,  656025,  784209, 1209710,
         210508,    2127,     295]))