In [1]:
import locale
from collections import defaultdict
from pathlib import Path

# Switch every locale category to the user's default settings (empty string),
# so the locale.strxfrm sort keys used when sorting follow that locale's
# collation rules.
# NOTE(review): the resulting order therefore depends on the locale configured
# on the machine that runs the notebook.
locale.setlocale(locale.LC_ALL, '')
# Directory whose *.txt annotation files are read, sorted and rewritten.
dir_name = 'annotations'

**CAUTION**: Running the next cell overwrites the existing files in place.

Sort the contents of each annotation file (`*.txt`) in the `dir_name` directory.

In [2]:
def pretty_sort(rows):
    """
    Order annotation rows morpheme by morpheme.

    rows = e.g. ["katajn\tkat'ajn", "katidajn\tkat'id'ajn", ...]

    Each row is "<word>\t<annotation>", where the annotation is a sequence of
    morphemes separated by apostrophes.  Rows are grouped by their first
    morpheme, then recursively by each following morpheme.  Within a group,
    rows whose annotation stops at the current position (or before it) are
    emitted first, followed by the rows that continue with further morphemes.
    All comparisons use locale.strxfrm, i.e. the collation of the active
    locale.

    Returns a new list; a row without a tab raises IndexError.
    """
    collate = locale.strxfrm

    def ordered(group, depth):
        # Rows whose annotation ends at `depth` (or earlier), keyed by the
        # morpheme at `depth`, or by '-' when there is no such morpheme.
        finished = {}
        # Rows that still have morphemes beyond `depth`, keyed by the
        # morpheme at `depth`.
        pending = {}

        for line in group:
            morphemes = line.split('\t')[1].split("'")
            count = len(morphemes)
            if depth == 0 or count > depth + 1:
                # The first level never terminates a row: even single-morpheme
                # rows are grouped under their first morpheme.
                pending.setdefault(morphemes[depth], []).append(line)
            elif count == depth + 1:
                finished.setdefault(morphemes[depth], []).append(line)
            else:
                finished.setdefault('-', []).append(line)

        for key in sorted(finished, key=collate):
            yield from sorted(finished[key], key=collate)

        for key in sorted(pending, key=collate):
            yield from ordered(pending[key], depth + 1)

    return list(ordered(rows, 0))


# Word-final endings in two spellings: each key is the ending written morpheme
# by morpheme (apostrophe-separated), each value is the same ending fused into
# a single morpheme.  Rows are rewritten key -> value before sorting and
# value -> key afterwards, so the sort sees each ending as one morpheme.
sep2cat = {
    # endings starting with 'o
    "'o'j": "'oj",
    "'o'j'n": "'ojn",
    "'o'n": "'on",
    # endings starting with 'a
    "'a'j": "'aj",
    "'a'j'n": "'ajn",
    "'a'n": "'an",
    # ending starting with 'e
    "'e'n": "'en",
}


def _swap_ending(row, old2new):
    """Return `row` with its ending rewritten according to `old2new`.

    Every (old, new) pair is tried in the mapping's iteration order; whenever
    the current row ends with `old`, that suffix is replaced by `new`.
    """
    for old, new in old2new.items():
        if row.endswith(old):
            row = row[:-len(old)] + new
    return row


# Reverse table (fused ending -> apostrophe-separated ending), used to restore
# the separated spelling once the rows are sorted.
cat2sep = {cat: sep for sep, cat in sep2cat.items()}

# Sort every *.txt file of the annotation directory and overwrite it in place.
for filename in Path(dir_name).glob('*.txt'):
    text = filename.read_text(encoding='utf-8')

    # Blank lines -- in particular the empty string that split('\n') yields
    # after a trailing newline, or for an empty file -- carry no
    # "word<TAB>annotation" pair and would make pretty_sort fail with
    # IndexError, so they are left out of the sort.
    rows = [_swap_ending(row, sep2cat)
            for row in text.split('\n') if row.strip()]

    sorted_rows = pretty_sort(rows)

    # NOTE(review): every row ending in a fused form is rewritten to the
    # separated form here, including rows that were already fused in the
    # input file -- confirm that this normalisation is intended.
    result_rows = [_swap_ending(row, cat2sep) for row in sorted_rows]

    output = '\n'.join(result_rows)
    # Keep the trailing newline if the file had one; files without one are
    # written exactly as before.
    if text.endswith('\n'):
        output += '\n'

    filename.write_text(output, encoding='utf-8')