# Imports & Co

In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-4.4.2-py2.py3-none-any.whl (15 kB)
Installing collected packages: conllu
Successfully installed conllu-4.4.2


In [None]:
from drive.MyDrive.trees.staff_func import *
from time import time
from tqdm import tqdm
import os
import json

# Verses

In [None]:
def encode_nodes(tree, cell_id=None, deprel_id=None):
    """
    Replace the cells and relation labels of a tree with integer codes.

    The input has the shape ``[node, [relation, subtree], [relation, subtree], ...]``.
    Codes are handed out in first-seen order; a relation label receives its code only
    after the subtree hanging on it has been encoded.

    :param tree: tree whose first element is turned into a cell by ``make_cell``
    :param cell_id: mapping cell -> code; when empty or missing, both mappings are re-created
    :param deprel_id: mapping relation label -> code
    :return: tuple ``(encoded_tree, cell_id, deprel_id)``
    """
    if not cell_id:
        # Fresh vocabularies: the empty cell and the 'root' relation always get code 0.
        cell_id, deprel_id = {Cell(): 0}, {'root': 0}

    head_cell = make_cell(tree[0])
    encoded = [cell_id.setdefault(head_cell, len(cell_id))]

    for branch in tree[1:]:
        link, subtree = branch[0], branch[1]
        # Encode the subtree first so that its codes precede the code of the link itself.
        encoded_subtree, cell_id, deprel_id = encode_nodes(subtree, cell_id, deprel_id)
        link_code = deprel_id.setdefault(link, len(deprel_id))
        encoded.append([link_code, encoded_subtree])

    return encoded, cell_id, deprel_id

In [None]:
def iter_tree(tree, prev_node=None, prev_deprel=None, cur_node=None, freq_dict=None, freq_matrix=None, three_id=None,
              root_freq=None, deprel_freq=None, deprel_line=None):
    """
    Recursively walk an encoded tree and accumulate frequency statistics.

    ``tree`` has the shape ``[cell_code, [deprel_code, subtree], ...]``. Sibling context is
    described by triples ``(previous, current, next)`` in which ``None`` stands for a missing
    neighbour. Every distinct triple (of relations or of cells - both kinds share one
    dictionary) is given an integer code in ``three_id``.

    Accumulators (created when passed as ``None``, otherwise updated in place):

    * ``freq_dict[parent][parent_rel][node][rel_triple][child_triple]`` - occurrence counts;
      the extra key ``-1`` under each ``rel_triple`` holds the total for that triple.
    * ``freq_matrix[rel][earlier]`` - how often relation ``rel`` occurred after ``earlier``
      among the same siblings; ``'BOS'`` is always the first "earlier" item and ``'EOS'``
      is recorded once after the last sibling.
    * ``root_freq[code]`` - how often a triple code was met as the top word of a sentence.
    * ``deprel_freq`` - nested dictionary of relation bigrams along the path from the root,
      updated each time a node without children is reached.

    :param tree: encoded tree (whole sentence) or subtree
    :param prev_node: code of the parent node; ``None`` means ``tree`` is a whole sentence
    :param prev_deprel: code of the relation that leads to the current node
    :param cur_node: code of the current node
    :param freq_dict: see above
    :param freq_matrix: see above
    :param three_id: mapping triple -> code
    :param root_freq: see above
    :param deprel_freq: see above
    :param deprel_line: relation codes on the path from the root to the current node;
        never modified in place
    :return: tuple ``(freq_dict, freq_matrix, three_id, root_freq, deprel_freq)``
    """
    def _update_(el):
        # Return the code of a triple, assigning the next free code on first sight.
        if el not in three_id:
            three_id[el] = len(three_id)
        return three_id[el]

    def _matrix_(el):
        # Count ``el`` as occurring after every item already seen among the current siblings.
        row = freq_matrix.setdefault(el, {})
        for dep in prev_deps:
            row[dep] = row.get(dep, 0) + 1

    def _deprel_inner_(line, deps):
        # Descend through the nested dictionary along ``line`` and bump the counter at its end.
        if len(line) > 0:
            if line[0] not in deps:
                deps[line[0]] = _deprel_inner_(line[1:], {})
            else:
                deps[line[0]] = _deprel_inner_(line[1:], deps[line[0]])
        else:
            if isinstance(deps, int):
                return deps + 1
            else:
                return 1
        return deps

    def _deprel_(line, deps, l=2):
        # Count every window of length ``l`` in ``line``. Short lines are padded with None
        # on a copy, so the path list shared with the caller is left untouched.
        padded = list(line) + [None] * (l - len(line))
        for i in range(len(padded) - l + 1):
            deps = _deprel_inner_(padded[i:i + l], deps)
        return deps

    # Create only the accumulators that were not supplied. Testing for None (not truthiness)
    # keeps caller-provided containers that merely happen to be empty, e.g. a seeded
    # ``three_id`` passed together with an empty ``freq_dict`` on the very first sentence.
    if freq_dict is None:
        freq_dict = {}
    if three_id is None:
        three_id = {(None, 0, None): 0, }
    if freq_matrix is None:
        freq_matrix = {}
    if root_freq is None:
        root_freq = {}
    if deprel_freq is None:
        deprel_freq = {}
    if deprel_line is None:
        deprel_line = []

    # Whole sentence: derive the starting codes by hand. ``is None`` matters here because
    # 0 is a valid code and must not be mistaken for "no parent" during recursion.
    if prev_node is None:
        if len(tree) < 2:
            # Nothing hangs on the root; return the same 5-tuple as every other exit.
            return freq_dict, freq_matrix, three_id, root_freq, deprel_freq
        prev_node = _update_((None, tree[0], None))
        prev_deprel = _update_((None, tree[1][0], None))
        cur_node = _update_((None, tree[1][1][0], None))
        # Only the first dependent of the root is followed.
        tree = tree[1][1]
        if cur_node not in root_freq:
            root_freq[cur_node] = 0
        root_freq[cur_node] += 1
        deprel_line = deprel_line + [prev_deprel]

    # Make sure the nested slot for (parent, relation, node) exists.
    if prev_node not in freq_dict:
        freq_dict[prev_node] = {}
    if prev_deprel not in freq_dict[prev_node]:
        freq_dict[prev_node][prev_deprel] = {}
    if cur_node not in freq_dict[prev_node][prev_deprel]:
        freq_dict[prev_node][prev_deprel][cur_node] = {}
    slot = freq_dict[prev_node][prev_deprel][cur_node]

    # Children framed by sentinels so that every child has a left and a right neighbour;
    # a node without children is represented by a single sentinel "child".
    children = [[None, [None]], ]
    if len(tree) > 1:
        children.extend(tree[1:])
    else:
        children.append([None, [None]])
    children.append([None, [None]])

    # Slide a window of three over the children; the middle item is the current child.
    prev_deps = ['BOS', ]
    for i in range(len(children) - 2):
        deprel = _update_(tuple(([item[0] for item in children[i:i+3]])))
        child = _update_(tuple(([item[1][0] for item in children[i:i+3]])))
        new_tree = children[i + 1][1]
        cur_dep = children[i + 1][0]

        # Count the (relation triple, child triple) pair and the per-relation total (-1).
        if deprel not in slot:
            slot[deprel] = {-1: 0, }
        if child not in slot[deprel]:
            slot[deprel][child] = 0
        slot[deprel][-1] += 1
        slot[deprel][child] += 1

        _matrix_(cur_dep)
        prev_deps.append(cur_dep)

        if new_tree != [None, ]:
            # Real child: go deeper with the path extended by this relation triple.
            freq_dict, freq_matrix, three_id, root_freq, deprel_freq = iter_tree(new_tree, cur_node, deprel, child,
                                                                                 freq_dict, freq_matrix, three_id,
                                                                                 root_freq, deprel_freq,
                                                                                 deprel_line + [deprel, ])
        else:
            # Sentinel child, i.e. the current node is a leaf: record the path that led here.
            deprel_freq = _deprel_(deprel_line, deprel_freq)

    _matrix_('EOS')

    return freq_dict, freq_matrix, three_id, root_freq, deprel_freq

In [None]:
def make_freq(data):
    """
    Entry point for collecting statistics over a whole corpus.

    Every tree is first encoded with ``encode_nodes`` and then, if anything hangs on its
    root, fed to ``iter_tree``. Trees that consist of the root alone are only counted and
    the count is printed at the end.

    :param data: sized collection of trees in the format accepted by ``encode_nodes``
    :return: tuple ``(freq_dict, matrix, roots, three_id, cell_id, deprel_id, deprel_freq)``
    """
    deprel_id = {'root': 0}
    cell_id = {Cell(): 0}
    three_id = {(None, 0, None): 0, (None, None, None): 1}
    freq_dict, matrix, roots, deprel_freq = {}, {}, {}, {}

    skipped = 0
    for sentence in tqdm(data, total=len(data)):
        encoded, cell_id, deprel_id = encode_nodes(sentence, cell_id=cell_id, deprel_id=deprel_id)
        if len(encoded) <= 1:
            # Root without dependents: nothing to count.
            skipped += 1
            continue
        freq_dict, matrix, three_id, roots, deprel_freq = iter_tree(
            encoded,
            freq_dict=freq_dict,
            freq_matrix=matrix,
            three_id=three_id,
            root_freq=roots,
            deprel_freq=deprel_freq,
            deprel_line=[],
        )

    if skipped > 0:
        print('empty: ', skipped)
    return freq_dict, matrix, roots, three_id, cell_id, deprel_id, deprel_freq

In [None]:
def convert_for_json(three_id, cell_id, deprel_id):
    """
    Turn the code dictionaries into lists that are convenient to store as JSON.

    In each resulting list the items are ordered by their code, so with consecutive codes
    starting at 0 the position of an item equals its code.

    :param three_id: mapping triple -> code
    :param cell_id: mapping cell -> code; every cell is serialised with its ``to_dict()``
    :param deprel_id: mapping relation label -> code
    :return: tuple ``(triples, cells, relations)`` of lists
    """
    def _ordered_by_code(mapping):
        # Keys of ``mapping`` sorted by the code stored as their value.
        return sorted(mapping, key=mapping.__getitem__)

    relations = list(_ordered_by_code(deprel_id))
    cells = [cell.to_dict() for cell in _ordered_by_code(cell_id)]
    triples = list(_ordered_by_code(three_id))
    return triples, cells, relations

# Resave conllu > json

In [None]:
def get_tree_dct(sent, cur=0, cur_el=None):
    """
    Build a nested-list tree ``[cell_dict, [deprel, subtree], ...]`` from a sentence.

    On the top-level call (``cur == 0``) the sentence is first regrouped into a mapping
    ``head -> list of dependents``; tokens whose head is ``None`` or whose ``upos`` is
    ``'_'`` or ``'PUNCT'`` are left out. Recursive calls receive that mapping as ``sent``.

    :param sent: iterable of tokens (top-level call) or the ``head -> dependents`` mapping
    :param cur: id of the token whose dependents are attached at this level
    :param cur_el: the token itself; ``None`` for the top-level call
    :return: list with the serialised cell first, followed by one pair per dependent
    """
    if cur == 0:
        # First call for this sentence: index the tokens by their head.
        by_head = {}
        for token in sent:
            head = token.get('head', 0)
            if head is None or token['upos'] in ['_', 'PUNCT']:
                continue
            by_head.setdefault(head, []).append(token)
        sent = by_head

    tree = [make_cell(cur_el).to_dict()]

    # Attach every dependent of the current token together with its own subtree.
    for dependent in sent.get(cur, []):
        relation = dependent['deprel']
        subtree = get_tree_dct(sent, dependent['id'], dependent)
        tree.append([relation, subtree])
    return tree

In [None]:
def resave_json(cur_dir, res_dir):
    """
    Convert one CoNLL-U file into a JSON file of trees.

    The file is parsed with ``parse_conllu``, every parsed sentence is converted with
    ``get_tree_dct``, the number of converted sentences is printed and the list of trees
    is dumped as JSON.

    :param cur_dir: path of the CoNLL-U file to read
    :param res_dir: path of the JSON file to write
    """
    with open(cur_dir, encoding='utf-8') as source:
        sentences = parse_conllu(source.read())

    trees = [get_tree_dct(sentence) for sentence in tqdm(sentences, total=len(sentences))]
    print(len(trees))

    with open(res_dir, 'w', encoding='utf-8') as target:
        json.dump(trees, target)

In [None]:
def resave_json_all(path='/content/drive/MyDrive/trees'):
    """
    Convert every corpus listed below from CoNLL-U to the JSON tree format.

    For each corpus name the file ``{path}/rus/{name}_1m.conllu`` is read and
    ``{path}/trees_data/rus/{name}.json`` is written; the name is printed before conversion.

    :param path: base directory that contains the ``rus`` and ``trees_data`` folders
    """
    for corpus in ('foreign_love_stories',):
        print(corpus)
        resave_json(f'{path}/rus/{corpus}_1m.conllu', f'{path}/trees_data/rus/{corpus}.json')

In [None]:
# resave_json_all()

# Get verses

In [None]:
def get_vers_one(cur_dir, res_dir):
    """
    Compute the statistics for one corpus and store them as a set of JSON files.

    The trees are loaded from ``cur_dir``, processed by ``make_freq``, the three code
    dictionaries are converted to lists by ``convert_for_json`` and every part of the
    result is written to ``{res_dir}/{name}.json`` via ``save_json``.

    :param cur_dir: path of the JSON file with trees
    :param res_dir: directory for the resulting files; created together with any missing
        parent directories
    """
    names = ['freq', 'matrix', 'roots', 'three', 'cell', 'deprel', 'deprel_freq']
    with open(cur_dir, encoding='utf-8') as f:
        data = json.load(f)

    freq_dict, matrix, roots, three_id, cell_id, deprel_id, deprel_freq = make_freq(data)
    # Only the code dictionaries need converting; the order below must match ``names``.
    three_id, cell_id, deprel_id = convert_for_json(three_id, cell_id, deprel_id)
    res = (freq_dict, matrix, roots, three_id, cell_id, deprel_id, deprel_freq)

    # makedirs (unlike mkdir) also creates missing parents and tolerates an existing directory.
    os.makedirs(res_dir, exist_ok=True)
    for name, part in zip(names, res):
        s = f'{res_dir}/{name}.json'
        save_json(s, part, cur_dir='', mode='w')

In [None]:
def get_vers_all(path='/content/drive/MyDrive/trees'):
    """
    Compute and store the statistics for every corpus listed below.

    For each corpus name the trees are read from ``{path}/trees_data/rus/{name}.json`` and
    the results are written into the directory ``{path}/data/rus/{name}``; the name is
    printed before processing.

    :param path: base directory that contains the ``trees_data`` and ``data`` folders
    """
    corpora = ('detective_for_kidds', 'detective_masters', 'fontanka',
               'foreign_love_stories', 'habr', 'membrana')
    for corpus in corpora:
        print(corpus)
        get_vers_one(f'{path}/trees_data/rus/{corpus}.json', f'{path}/data/rus/{corpus}')

In [None]:
# Compute and save the statistics for every corpus listed in get_vers_all (default base path).
get_vers_all()

detective_for_kidds


100%|██████████| 81298/81298 [00:28<00:00, 2885.27it/s]


empty:  10
detective_masters


100%|██████████| 79159/79159 [00:28<00:00, 2733.49it/s]


empty:  9
fontanka


100%|██████████| 61190/61190 [00:26<00:00, 2268.20it/s]


empty:  3
foreign_love_stories


100%|██████████| 77625/77625 [00:25<00:00, 3084.98it/s]


empty:  25
habr


100%|██████████| 55675/55675 [00:31<00:00, 1795.07it/s]


empty:  28
membrana


100%|██████████| 44304/44304 [00:31<00:00, 1422.68it/s]
