In [1]:
import os
import re

from tokenizers import Tokenizer
# Load the tokenizer serialized in tokenizer_unigram_5k.json
# (judging by the file name, a unigram model with a ~5k vocabulary — TODO confirm).
tokenizer_path = os.path.join('../dags/src/spellcheck/data/', 'tokenizer_unigram_5k.json')
tokenizer_uni = Tokenizer.from_file(tokenizer_path)

In [2]:
# Read the word list, one entry per line.
# The encoding is given explicitly so the non-ASCII text is decoded the same way
# on every platform instead of depending on the locale default.
with open('../data/tesstrain/kbd/configs/kbd.wordlist', 'r', encoding='utf-8') as f:
    # Drop empty entries (e.g. the one produced by a trailing newline) so that
    # no empty "word" is tokenized and written out downstream.
    words = [line for line in f.read().split('\n') if line]

In [3]:
from collections import defaultdict

data = []  # not used in the visible cells; kept in case another cell relies on it

# Token-id cut-offs used to build the templates below: for each cut-off, every
# token whose id is above it is replaced by the '*' wildcard.
# NOTE(review): presumably lower ids are more frequent tokens — confirm against the tokenizer.
VOCAB_LIMITS = (100, 200, 300, 500, 1000)

# template ('|'-joined tokens with '*' wildcards) -> {(tokenized word, token ids)}
regexps = defaultdict(set)

# every distinct word in its '|'-joined tokenized form
tokenized_words = set()

for word in sorted(words):
    tokens = tokenizer_uni.encode(word).tokens
    token_ids = [tokenizer_uni.token_to_id(token) for token in tokens]

    # These depend only on the word, so they are built once instead of once per limit.
    tokenized_word = '|'.join(tokens)
    tokenized_word_ids = '|'.join(str(token_id) for token_id in token_ids)
    tokenized_words.add(tokenized_word)

    for limit in VOCAB_LIMITS:
        # A falsy id (None or 0) or an id above the cut-off becomes a wildcard;
        # any other id is kept as its token text.
        filtered_tokens = [
            '*' if not token_id or token_id > limit else tokenizer_uni.id_to_token(token_id)
            for token_id in token_ids
        ]
        reg = '|'.join(filtered_tokens)
        regexps[reg].add((tokenized_word, tokenized_word_ids))

# One tokenized word per line, sorted; also reused by later cells as the search text.
tokenized_text = '\n'.join(sorted(tokenized_words))
# Explicit encoding so the non-ASCII output does not depend on the locale default.
with open('../data/tesstrain/kbd/tokenized_text.txt', 'w', encoding='utf-8') as f:
    f.write(tokenized_text)

In [4]:
import nltk
from collections import Counter

# Count n-grams of adjacent tokens (n=2) across all distinct tokenized words.
# NOTE(review): the saved output of the following cell lists 3-token tuples,
# which n=2 cannot produce — the output looks stale or n was 3 when it ran; confirm.
cnt = Counter(
    ngram
    for word in tokenized_words
    for ngram in nltk.ngrams(word.split('|'), 2)
)

In [5]:
# Display the 100 most frequent n-grams together with their counts.
cnt.most_common(100)

[(('къы', 'зэ', 'ры'), 4222),
 (('щ', 'хь', 'э'), 3560),
 (('лъ', 'агъ', 'у'), 2523),
 (('хэ', 'м', 'рэ'), 2003),
 (('тI', 'ы', 'с'), 1955),
 (('л', 'эж', 'ь'), 1789),
 (('щI', 'ы', 'хь'), 1752),
 (('п', 'лъ', 'э'), 1642),
 (('цI', 'ы', 'ху'), 1639),
 (('у', 'п', 'щI'), 1529),
 (('зэ', 'ры', 'зэ'), 1500),
 (('щI', 'э', 'у'), 1462),
 (('ы', 'с', 'хь'), 1403),
 (('хь', 'э', 'у'), 1390),
 (('къы', 'щI', 'э'), 1357),
 (('б', 'гъэ', 'дэ'), 1344),
 (('э', 'хэ', 'м'), 1326),
 (('гъэ', 'хь', 'э'), 1257),
 (('къ', 'и', 'гъэ'), 1236),
 (('т', 'хь', 'э'), 1209),
 (('ын', 'у', 'р'), 1196),
 (('гъэ', 'кIу', 'э'), 1189),
 (('гъэ', 'тI', 'ы'), 1167),
 (('эж', 'ын', 'у'), 1159),
 (('эн', 'у', 'р'), 1115),
 (('зэ', 'ры', 'щы'), 1108),
 (('зэ', 'щI', 'э'), 1081),
 (('лъ', 'ы', 'хь'), 1079),
 (('зэ', 'фI', 'э'), 1070),
 (('э', 'хэ', 'р'), 1063),
 (('кI', 'ын', 'у'), 1040),
 (('Iэ', 'щI', 'э'), 1033),
 (('лъ', 'хь', 'э'), 1031),
 (('хь', 'э', 'лI'), 1027),
 (('щI', 'э', 'кI'), 997),
 (('ху', 'и', 'гъэ'), 

In [4]:
# Keep only templates with at most two wildcards that group
# between 11 and 19 tokenized words (both bounds inclusive).
filtered_regexps = {}
for template, members in regexps.items():
    if template.count('*') > 2:
        continue
    if not (10 < len(members) < 20):
        continue
    filtered_regexps[template] = members

In [5]:
# Write one file per selected template, largest groups first.
# Each line holds a tokenized word and its token ids, separated by a tab.
os.makedirs('../data/tesstrain/kbd/tokens/', exist_ok=True)
for k, v in sorted(filtered_regexps.items(), key=lambda x: len(x[1]), reverse=True):
    wildcards = k.count('*')
    # File name: (<wildcard count>)<template with '*' shown as '_'>(<group size>).txt
    # NOTE(review): the name keeps the '|' separators from the template, which is
    # not a valid file-name character on Windows.
    name = f'({wildcards}){k.replace("*", "_")}({len(v)}).txt'
    # Explicit encoding so the non-ASCII content does not depend on the locale default.
    with open(f'../data/tesstrain/kbd/tokens/{name}', 'w', encoding='utf-8') as f:
        # v is a set; sort it so the file content is identical from run to run.
        for tokens, token_ids in sorted(v):
            f.write(f'{tokens}\t{token_ids}\n')

In [6]:
def build_regexp(key, tokens, max_alternatives=15):
    """Compile a regular expression for one '|'-separated token template.

    Args:
        key: Template string whose '|'-separated fields are either a literal
            token or '*' for a wildcard position.
        tokens: Iterable of '|'-joined tokenized words belonging to the
            template. Each must have at least as many fields as ``key``.
        max_alternatives: Largest number of words for which a wildcard is
            expanded into an explicit alternation of the tokens seen at that
            position; with more words the wildcard becomes a generic ``.+``.

    Returns:
        A compiled pattern in which the fields are joined by a literal '|'.
        A literal field at position i is captured as group ``gf_i`` and a
        wildcard field as group ``g_i``.

    Raises:
        IndexError: If a word in ``tokens`` has fewer fields than ``key``
            at a wildcard position.

    NOTE(review): the pattern is not anchored, so ``search``/``finditer`` can
    match inside a longer line, and the ``.+`` fallback can span several
    '|'-separated fields.
    """
    fields = key.split('|')  # split the template into its fields
    token_split = [token.split('|') for token in tokens]

    pattern_parts = []
    for index, field in enumerate(fields):
        if field != '*':
            # Named capture for a literal field; escaped so that regex
            # metacharacters inside a token are matched literally.
            pattern_parts.append(f'(?P<gf_{index}>{re.escape(field)})')
            continue

        observed = [parts[index] for parts in token_split]
        if len(observed) <= max_alternatives:
            # Named capture for a wildcard: alternation of the tokens seen at
            # this position, escaped and de-duplicated in first-seen order
            # (which leaves the set of matches unchanged).
            alternatives = '|'.join(re.escape(tok) for tok in dict.fromkeys(observed))
            pattern_parts.append(f'(?P<g_{index}>{alternatives})')
        else:
            # Named capture for a wildcard with too many words to enumerate.
            pattern_parts.append(f'(?P<g_{index}>.+)')

    # Re-assemble the template, joining the fields with an escaped '|'.
    return re.compile('\\|'.join(pattern_parts))

# For every selected template (largest groups first) compile its pattern and
# scan the full tokenized text with it.
# NOTE(review): `extracted` is computed but never used, and each set only ever
# receives the template key `k`; the intent may have been to collect the
# matched/extracted text instead — confirm before relying on `regexps_words`.
regexps_words = defaultdict(set)
ranked_templates = sorted(filtered_regexps.items(), key=lambda x: len(x[1]), reverse=True)
for k, v in ranked_templates:
    tokens = [pair[0] for pair in v]  # tokenized words only, ids dropped
    r = build_regexp(k, tokens)
    for m in r.finditer(tokenized_text):
        gd = m.groupdict()
        # Concatenate the text captured by the wildcard groups, in field order.
        wildcard_parts = []
        for i in range(len(gd)):
            group_name = f'g_{i}'
            if group_name in gd:
                wildcard_parts.append(gd[group_name])
        extracted = ''.join(wildcard_parts)
        regexps_words[r].add(k)