In [584]:

import re

# Named-group regex fragments, one per morpheme slot. Rules are built by
# concatenating these strings, so every fragment is a single named group.
# A trailing empty alternative ("...|)") makes the slot optional.
#
# The palochka is written as Latin capital "I" throughout: get_groups()
# lower-cases the word and then rewrites "i" to "I", so that is the only form
# a fragment can ever see (a Cyrillic U+04C0 is lower-cased to U+04CF first
# and would therefore never match).
direct_object_prefix = r'(?P<direct_object_prefix>у|б|ф|в|д|т|с|з|я|м|)'  # direct-object prefix
indirect_object_prefix = r'(?P<indirect_object_prefix>а|ы|э|е|)'  # indirect-object prefix
action_direction = r'(?P<action_direction>къу|къэ|къы|къ|ды|)'  # direction of the action
category1 = r'(?P<category1>щы|)'  # category 1 (e.g. some grammatical category)
version_category = r'(?P<version_category>хуа|хуэ|ху|)'  # version category
indirect_object_prefix2 = r'(?P<indirect_object_prefix2>зы|зэ|зо|во|до|дэ|хэ|)'  # indirect-object prefix (second slot)
category2 = r'(?P<category2>рызэ|)'  # category 2
category3 = r'(?P<category3>ремы|ре|рызы|ры|рэ|ра|ро|)'  # category 3 (duplicate "рэ" alternative removed)
affix1 = r'(?P<affix1>с|з|п|б|и|т|д|я|йа|а|е|)'  # prefix 1
version_category_repeat = r'(?P<version_category_repeat>хуа|хуэ|ху|)'  # version category, repeated
affix2 = r'(?P<affix2>хы|ха|х|)'  # prefix 2 (last alternative is Cyrillic "х"; it used to be a Latin "x")
version_category2 = r'(?P<version_category2>фIэ|фI|ф|)'  # second version category
category4 = r'(?P<category4>гъэ|)'  # category 4
affix3 = r'(?P<affix3>е|с|з|о|вэ|в|зы|ды|дэ|де|ди|да|до|т|)'  # prefix 3
subject_prefix = r'(?P<subject_prefix>з|б|и|д|в|я|йа|а|ри|ра|ре|ры|)'  # subject prefix
affix4 = r'(?P<affix4>ы|о|э|)'  # prefix 4
root_morpheme_0 = r'(?P<root_morpheme_0>.*?)'  # root morpheme (lazy: takes as little as possible)
negation_prefix = r'(?P<negation_prefix>мы|)'  # negation prefix
causative_prefix = r'(?P<causative_prefix>гъэ|)'  # causative prefix
root_morpheme_1 = r'(?P<root_morpheme_1>.*?)'  # root morpheme (lazy: takes as little as possible)
affix5 = r'(?P<affix5>хь|)'  # suffix 5
returned_affix = r'(?P<returned_affix>ыж|жы|жа|ж|)'  # reflexive ("returned") suffix
affix7 = r'(?P<affix7>ы|э|а|)'  # suffix 7
possibility_suffix = r'(?P<possibility_suffix>ф|)'  # suffix expressing possibility of the action
affix8 = r'(?P<affix8>т|)'  # suffix 8
past_tense_suffix = r'(?P<past_tense_suffix>ах|а|х|)'  # past-tense suffix
various_endings = r'(?P<various_endings>ын|эн|нщ|ащ|ущ|ыж|жы|эжу|эж|ат|эм|ар|ам|э)'  # assorted endings (mandatory: no empty alternative)
verb_2_noun_end = r'(?P<verb_2_noun_end>гъэ|гъ)'  # ending that turns a verb into a noun (mandatory)
additional1 = r'(?P<additional1>ау|)'  # additional part 1
affix9 = r'(?P<affix9>ы|у|а|)'  # suffix 9
additional2 = r'(?P<additional2>фы|)'  # additional part 2
future_tense_suffix = r'(?P<future_tense_suffix>щ|р|м|)'  # future-tense suffix
large_affix = r'(?P<large_affix>шхуэ)'  # augmentative suffix (mandatory)
absence_affix = r'(?P<absence_affix>ншэ)'  # absence suffix (mandatory)
multiple_affix = r'(?P<multiple_affix>хэ|)'  # plural suffix
good_affix = r'(?P<good_affix>фIэ)'  # "good" suffix, going by the identifier (mandatory) - TODO confirm meaning
head_affix = r'(?P<head_affix>щхьэ)'  # "head" suffix, going by the identifier (mandatory) - TODO confirm meaning
affix10 = r'(?P<affix10>ы|у|а|э|)'  # suffix 10
future_tense_suffix2 = r'(?P<future_tense_suffix2>ну|н|)'  # second future-tense suffix
negation_form = r'(?P<negation_form>къым)'  # negation form (mandatory)
additional3 = r'(?P<additional3>эрэ|ра|)'  # additional part 3
affix11 = r'(?P<affix11>щ|р|м|т)'  # suffix 11 (mandatory)
additional4 = r'(?P<additional4>кI|)'  # additional part 4
affix12 = r'(?P<affix12>и|э|)'  # suffix 12
remaining_chars = r'(?P<remaining_chars>\w*)'  # whatever word characters are left

# Catch-all pattern with one optional named group per morpheme slot.
#
# The palochka is written as Latin capital "I": get_groups() lower-cases the
# word and rewrites "i" to "I", so a Cyrillic U+04C0 in the pattern could
# never match. The lone "х" in affix2 is Cyrillic (it used to be a Latin "x").
#
# NOTE(review): the pattern has no overall end anchor and both root groups are
# lazy, so match() can stop right after the leading prefixes (for 'бгырыпхым'
# only direct_object_prefix='б' is captured and both roots stay empty).
# Anchoring the pattern would change every parse, so confirm the intended
# behaviour before doing that.
fallback_regexp = re.compile(
    r'(?P<direct_object_prefix>у|б|ф|в|д|т|с|з|я|м|)?'  # direct-object prefix
    r'(?P<indirect_object_prefix>а|ы|э|е|)?'  # indirect-object prefix
    r'(?P<action_direction>къу|къэ|къы|къ|ды|)?'  # direction of the action
    r'(?P<category1>щы|)?'  # category 1 (e.g. some grammatical category)
    r'(?P<version_category>хуа|хуэ|ху|)?'  # version category
    r'(?P<indirect_object_prefix2>зы|зэ|зо|во|до|дэ|хэ|)?'  # indirect-object prefix (second slot)
    r'(?P<category2>рызэ|)?'  # category 2
    r'(?P<category3>ремы|ре|рызы|ры|рэ|ро|)?'  # category 3
    r'(?P<affix1>с|з|п|б|и|т|д|я|йа|а|е|)?'  # prefix 1
    r'(?P<version_category_repeat>хуа|хуэ|ху|)?'  # version category, repeated
    r'(?P<affix2>хы|ха|х|)?'  # prefix 2
    r'(?P<version_category2>фIэ|фI|ф|)?'  # second version category
    r'(?P<category4>гъэ|)?'  # category 4
    r'(?P<affix3>е|с|з|о|вэ|в|зы|ды|дэ|де|ди|да|до|т|)?'  # prefix 3
    r'(?P<subject_prefix>з|б|и|д|в|я|йа|а|ри|ра|ре|ры|)?'  # subject prefix
    r'(?P<affix4>ы|о|э|)?'  # prefix 4
    r'(?P<root_morpheme_0>.*?)'  # root morpheme (lazy)
    r'(?P<negation_prefix>мы|)?'  # negation prefix
    r'(?P<causative_prefix>гъэ|)?'  # causative prefix
    r'(?P<root_morpheme_1>.*?)'  # root morpheme (lazy)
    r'(?P<affix5>хь|)?'  # suffix 5
    r'(?P<returned_affix>ыж|жы|жа|ж|)?'  # reflexive ("returned") suffix
    r'(?P<affix7>ы|э|а|)?'  # suffix 7
    r'(?P<possibility_suffix>ф|)?'  # suffix expressing possibility of the action
    r'(?P<affix8>т|)?'  # suffix 8
    r'(?P<past_tense_suffix>ах|а|х|)?'  # past-tense suffix
    # The next two groups have no empty alternative, so they are None (not '')
    # when they do not take part in the match.
    r'(?P<various_endings>ын|эн|нщ|ащ|ущ|ыж|жы|эжу|эж|ат|эм|ар|ам|э|гъэ|гъ)?'  # assorted endings
    r'(?P<various_end>ын$|эн$|нщ$|ащ$|ущ$|ыж$|жы$|эжу$|эж$|ат$|эм$|ар$|ам$|э$|гъэ$|гъ$)?'  # same endings, only at the end of the word
    r'(?P<additional1>ау|)?'  # additional part 1
    r'(?P<affix9>ы|у|а|)?'  # suffix 9
    r'(?P<additional2>фы|)?'  # additional part 2
    r'(?P<future_tense_suffix>щ|р|м|)?'  # future-tense suffix
    r'(?P<large_affix>шхуэ|)?'  # augmentative suffix
    r'(?P<absence_affix>ншэ|)?'  # absence suffix
    r'(?P<multiple_affix>хэ|)?'  # plural suffix
    r'(?P<good_affix>фIэ|)?'  # "good" suffix, going by the group name - TODO confirm meaning
    r'(?P<head_affix>щхьэ|)?'  # "head" suffix, going by the group name - TODO confirm meaning
    r'(?P<affix10>ы|у|а|э|)?'  # suffix 10
    r'(?P<future_tense_suffix2>ну|н|)?'  # second future-tense suffix
    r'(?P<negation_form>къым|)?'  # negation form
    r'(?P<additional3>эрэ|ра|)?'  # additional part 3
    r'(?P<affix11>щ|р|м|т|)?'  # suffix 11
    r'(?P<affix11_end>щ$|р$|м$|т$|)?'  # suffix 11, only at the end of the word
    r'(?P<additional4>кI|)?'  # additional part 4
    r'(?P<affix12>и$|э$|)?'  # suffix 12, only at the end of the word
    # r'(?P<remaining_chars>\w*)'  # remaining characters (disabled)
)

# Morpheme-slot (regex group) names in display order. The list is used to
# build the "all_part:..." label for fallback matches and to order the
# DataFrame/CSV columns.
# 'various_end' and 'affix11_end' are groups of fallback_regexp that used to
# be missing here, so their captures never showed up in labels or columns.
ordered_part_name = [
    'direct_object_prefix',
    'indirect_object_prefix',
    'action_direction',
    'category1',
    'version_category',
    'indirect_object_prefix2',
    'category2',
    'category3',
    'affix1',
    'version_category_repeat',
    'affix2',
    'version_category2',
    'category4',
    'affix3',
    'subject_prefix',
    'affix4',
    'root_morpheme_0',
    'negation_prefix',
    'causative_prefix',
    'root_morpheme_1',
    'affix5',
    'returned_affix',
    'affix7',
    'possibility_suffix',
    'affix8',
    'past_tense_suffix',
    'various_endings',
    'various_end',
    'verb_2_noun_end',
    'additional1',
    'affix9',
    'additional2',
    'future_tense_suffix',
    'large_affix',
    'absence_affix',
    'good_affix',
    'head_affix',
    'multiple_affix',
    'affix10',
    'future_tense_suffix2',
    'negation_form',
    'additional3',
    'affix11',
    'affix11_end',
    'additional4',
    'affix12',
    'remaining_chars',
]


class KabardianStemmer:
    """Regex-based morpheme splitter and stemmer.

    ``regexp_map`` is a list of ``(rule name, compiled pattern, enabled flag)``
    triples built from the module-level named-group fragments. A rule is
    accepted only when it matches and every one of its groups captured a
    non-empty string. When no rule is accepted, the module-level
    ``fallback_regexp`` is tried instead.
    """

    regexp_map = [
        ('diaf11', re.compile(
            direct_object_prefix + indirect_object_prefix + action_direction +
            root_morpheme_1 + future_tense_suffix2 + affix11 + '$'
        ), False),
        ('diar1112', re.compile(
            direct_object_prefix + indirect_object_prefix + action_direction +
            root_morpheme_1 + returned_affix + affix11 + affix12 + '$'
        ), False),
        ('lg', re.compile(
            root_morpheme_1 + large_affix + '$'
        ), False),
        ('rl11', re.compile(
            root_morpheme_1 + large_affix + affix11 + '$'
        ), False),
        ('abs', re.compile(
            root_morpheme_1 + absence_affix + '$'
        ), False),
        ('av_ng', re.compile(
            action_direction + version_category + affix1 + affix3 + root_morpheme_1 + negation_form + '$'
        ), False),
        ('a13ng', re.compile(
            action_direction + affix1 + affix3 + subject_prefix + root_morpheme_1 + negation_form + '$'
        ), False),
        ('810ng', re.compile(
            root_morpheme_1 + affix8 + affix10 + negation_form + '$'
        ), False),
        ('ng', re.compile(
            root_morpheme_1 + negation_form + '$'
        ), False),
        ('gm11', re.compile(
            root_morpheme_1 + good_affix + multiple_affix + affix11 + '$'
        ), False),
        ('v2nm11', re.compile(
            root_morpheme_1 + verb_2_noun_end + multiple_affix + affix11 + '$'
        ), False),
        ('am1112', re.compile(
            action_direction + root_morpheme_1 + multiple_affix + affix11 + affix12 + '$'
        ), False),
        ('m1112', re.compile(
            root_morpheme_1 + multiple_affix + affix11 + affix12 + '$'
        ), False),
        ('aim311', re.compile(
            action_direction + indirect_object_prefix2 + category3 + root_morpheme_1 + multiple_affix + affix11 + '$'
        ), False),
        ('dcm11', re.compile(
            direct_object_prefix + causative_prefix + root_morpheme_1 + multiple_affix + affix11 + '$'
        ), False),
        ('m11', re.compile(
            root_morpheme_1 + multiple_affix + affix11 + '$'
        ), False),
        ('a13c1011', re.compile(
            action_direction + affix1 + affix3 + subject_prefix + causative_prefix + root_morpheme_1 + affix10 + affix11 + '$'
        ), False),
        ('a131011', re.compile(
            action_direction + affix1 + affix3 + root_morpheme_1 + affix10 + affix11 + '$'
        ), False),
        ('a1011', re.compile(
            action_direction + root_morpheme_1 + affix10 + affix11 + '$'
        ), False),
        ('dnc10', re.compile(
            direct_object_prefix + negation_prefix + causative_prefix + root_morpheme_1 + affix10 + '$'
        ), False),
        ('din', re.compile(
            direct_object_prefix + indirect_object_prefix + root_morpheme_1 + negation_form + '$'
        ), False),
        ('dii9f10', re.compile(
            direct_object_prefix + indirect_object_prefix + indirect_object_prefix2 + root_morpheme_1 + affix9
            + future_tense_suffix + affix10 + '$'
        ), False),
        ('dii10', re.compile(
            direct_object_prefix + indirect_object_prefix + indirect_object_prefix2 + root_morpheme_1 + affix10 + '$'
        ), False),
        ('di10', re.compile(
            direct_object_prefix + indirect_object_prefix + root_morpheme_1 + affix10 + '$'
        ), False),
        ('di11', re.compile(
            direct_object_prefix + indirect_object_prefix + root_morpheme_1 + affix11 + '$'
        ), False),
        ('ca11', re.compile(
            category1 + affix1 + root_morpheme_1 + affix11 + '$'
        ), False),
        ('cr11', re.compile(
            root_morpheme_1 + returned_affix + affix11 + '$'
        ), False),
        ('cg11', re.compile(
            root_morpheme_1 + good_affix + affix11 + '$'
        ), False),
        ('h11', re.compile(
            root_morpheme_1 + head_affix + affix11 + '$'
        ), False),

        # ('c11', re.compile(
        #     category1 + root_morpheme_1 + r'(?P<affix11>щ|р|м|т)' + '$'
        # )),
        # ('11', re.compile(
        #     root_morpheme_1 + affix11 + '$'
        # )),
        # ('3', re.compile(
        #     indirect_object_prefix + category4 + root_morpheme_1 + various_endings + '$'
        # )),
    ]

    def __init__(self, min_len=4, group_root_index=18):
        """Configure the stemmer.

        :param min_len: words shorter than this are not analysed, and roots
            shorter than this are extended with neighbouring affixes.
        :param group_root_index: 0-based position of the root inside a groups
            tuple. It is only used when ``complicate`` / ``_process_affixes``
            are called without an explicit ``root_index``; ``stem`` takes the
            position from the pattern that actually matched.
        """
        self.min_len = min_len
        self.group_root_index = group_root_index

    def _find_match(self, word, skip_disabled=False):
        """Normalise ``word`` and return ``(match, name)``.

        ``match`` is ``None`` when the word is too short or nothing matched;
        ``name`` then carries the reason (``'too_short'`` / ``'no_match'``).
        """
        # Lower-case, then write a Latin "i" as "I", the form the patterns use.
        word = word.lower().replace('i', 'I')
        if len(word) < self.min_len:
            return None, 'too_short'

        for name, regexp, is_enable in self.regexp_map:
            if skip_disabled and not is_enable:
                continue

            match = regexp.match(word)
            # A rule counts only if every one of its groups captured text.
            if match and all(match.groups()):
                return match, name

        # No rule was accepted. Returning from inside the loop (instead of
        # testing the loop variable afterwards) guarantees that a partial
        # match of the last rule cannot be mistaken for a hit.
        match = fallback_regexp.match(word)
        if match is None:
            return None, 'no_match'

        captured = match.groupdict()
        part_joined = ':'.join([part for part in ordered_part_name if captured.get(part)])
        return match, f'all_part:{part_joined}'

    def get_groups(self, word, skip_disabled=False):
        """Split ``word`` into morpheme groups.

        :param word: the word to analyse (case-insensitive).
        :param skip_disabled: when true, rules whose enabled flag is false are
            not tried.
        :return: ``(groups, groupdict, name)`` - the match's ``groups()`` and
            ``groupdict()`` plus the rule name (``'all_part:<non-empty parts>'``
            for the fallback pattern). ``groups`` and ``groupdict`` are ``None``
            when the word is shorter than ``min_len`` or nothing matched.
        """
        match, name = self._find_match(word, skip_disabled)
        if match is None:
            return None, None, name

        return match.groups(), match.groupdict(), name

    def stem(self, word):
        """Return the stem of ``word`` as a string.

        The word itself is returned when it cannot be analysed (too short, no
        match) or when no non-empty stem could be extracted.
        """
        match, _name = self._find_match(word)
        if match is None:
            return word

        groups = match.groups()
        groupdict = match.groupdict()
        # Pattern.groupindex maps group names to 1-based group numbers.
        group_index = match.re.groupindex

        root_0 = groupdict.get('root_morpheme_0') or ''
        root_1 = groupdict.get('root_morpheme_1') or ''
        root = root_0 + root_1

        if groupdict.get('negation_prefix'):
            # Keep everything captured before the negation prefix. The cut is
            # taken from the group's position, not from the first group that
            # happens to hold the same text.
            cut = group_index['negation_prefix'] - 1
            return ''.join(part or '' for part in groups[:cut]) + root

        if len(root) >= self.min_len:
            return root

        root_position = group_index.get('root_morpheme_1')
        root_index = root_position - 1 if root_position else None
        # An empty stem carries no information; hand the word back unchanged,
        # as is done for words that are too short to analyse.
        return self.complicate(root, groups, root_index=root_index) or word

    def complicate(self, stem, groups, root_index=None):
        """Extend a too-short ``stem`` with its nearest affix on each side.

        :param stem: the root extracted so far.
        :param groups: the full groups tuple of the match.
        :param root_index: 0-based position of the root in ``groups``;
            defaults to ``self.group_root_index``.
        :return: left affix + stem + right affix when that is at least
            ``min_len`` characters long, otherwise ``stem`` unchanged.
        """
        groups_list = list(groups)

        right_affix = self._process_affixes(groups_list, "right", root_index)
        left_affix = self._process_affixes(groups_list, "left", root_index)

        complicate_stem = (left_affix or '') + stem + (right_affix or '')
        return complicate_stem if len(complicate_stem) >= self.min_len else stem

    def _process_affixes(self, groups_list, direction, root_index=None):
        """Take the non-empty group closest to the root on one side.

        The chosen slot in ``groups_list`` is set to ``None`` so a later call
        would pick the next one. Returns the affix text, or ``None`` when that
        side has no non-empty group.
        """
        if root_index is None:
            root_index = self.group_root_index

        # Clamp to the list bounds: rules have different numbers of groups, so
        # the root position may lie beyond the end of a short groups tuple.
        if direction == "right":
            candidates = range(max(root_index + 1, 0), len(groups_list))
        else:  # direction == "left"
            # Stop value -1 so that index 0 is inspected as well.
            candidates = range(min(root_index, len(groups_list)) - 1, -1, -1)

        for index in candidates:
            affix = groups_list[index]
            if affix:
                # Read the value before clearing the slot.
                groups_list[index] = None
                return affix

        return None


In [585]:
from pprint import pprint

# Build a stemmer with the default settings and dump its rule table:
# a list of (rule name, compiled pattern, enabled flag) tuples.
stemmer = KabardianStemmer()
pprint(stemmer.regexp_map)

# Sample word. Commas are stripped before the lookup; the value of the last
# expression - get_groups' (groups, groupdict, name) tuple - is the cell output.
q = 'Бгырыпхым'
stemmer.get_groups(q.replace(',', ''))

[('diaf11',
  re.compile('(?P<direct_object_prefix>у|б|ф|в|д|т|с|з|я|м|)(?P<indirect_object_prefix>а|ы|э|е|)(?P<action_direction>къу|къэ|къы|къ|ды|)(?P<root_morpheme_1>.*?)(?P<future_tense_suffix2>ну|н|)(?P<affix11>щ|р|м|т)$'),
  False),
 ('diar1112',
  re.compile('(?P<direct_object_prefix>у|б|ф|в|д|т|с|з|я|м|)(?P<indirect_object_prefix>а|ы|э|е|)(?P<action_direction>къу|къэ|къы|къ|ды|)(?P<root_morpheme_1>.*?)(?P<returned_affix>ыж|жы|жа|ж|)(?P<affix11>щ|р|м|т)(?),
  False),
 ('lg', re.compile('(?P<root_morpheme_1>.*?)(?P<large_affix>шхуэ)$'), False),
 ('rl11',
  re.compile('(?P<root_morpheme_1>.*?)(?P<large_affix>шхуэ)(?P<affix11>щ|р|м|т)$'),
  False),
 ('abs', re.compile('(?P<root_morpheme_1>.*?)(?P<absence_affix>ншэ)$'), False),
 ('av_ng',
  re.compile('(?P<action_direction>къу|къэ|къы|къ|ды|)(?P<version_category>хуа|хуэ|ху|)(?P<affix1>с|з|п|б|и|т|д|я|йа|а|е|)(?P<affix3>е|с|з|о|вэ|в|зы|ды|дэ|де|ди|да|до|т|)(?P<root_morpheme_1>.*?)(?P<negation_form>к),
  False),
 ('a13ng',
  re.compile

(('б',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  None,
  None,
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''),
 {'direct_object_prefix': 'б',
  'indirect_object_prefix': '',
  'action_direction': '',
  'category1': '',
  'version_category': '',
  'indirect_object_prefix2': '',
  'category2': '',
  'category3': '',
  'affix1': '',
  'version_category_repeat': '',
  'affix2': '',
  'version_category2': '',
  'category4': '',
  'affix3': '',
  'subject_prefix': '',
  'affix4': '',
  'root_morpheme_0': '',
  'negation_prefix': '',
  'causative_prefix': '',
  'root_morpheme_1': '',
  'affix5': '',
  'returned_affix': '',
  'affix7': '',
  'possibility_suffix': '',
  'affix8': '',
  'past_tense_suffix': '',
  'various_endings': None,
  'various_end': None,
  'additional1': '',
  'affix9': '',
  'additional2': '',
  'future_tense_

In [574]:
import pandas as pd

# Read the word list, one word per line. The encoding is given explicitly so
# the Cyrillic text does not depend on the platform default.
# NOTE(review): UTF-8 is assumed for this file - confirm.
with open('../data/tesstrain/kbd/configs/kbd.wordlist', 'r', encoding='utf-8') as f:
    words = f.read().split('\n')

# One record per word that get_groups could analyse; words that are too short
# (including the empty string after a trailing newline) are skipped.
words_grouped = []

for word in sorted(words):
    groups, groupdict, name = stemmer.get_groups(word)
    if groups:
        data = dict(
            word=word,
            name=name,
            **groupdict
        )
        words_grouped.append(data)

# reindex() orders the columns and creates any column that no record supplied,
# so no placeholder record is needed (the old placeholder ended up as an
# all-empty first row in the frame and in the exported CSV).
df = pd.DataFrame(words_grouped).reindex(
    columns=['word'] + ordered_part_name + ['name']
)

In [575]:
# Write the per-word morpheme table next to the word list; index=False leaves
# the DataFrame's row index out of the file.
df.to_csv('../data/tesstrain/kbd/configs/kbd.wordlist.csv', index=False)