In [50]:
import os
from collections import defaultdict
from pprint import pprint

import pandas as pd

with open(os.path.join('..', 'data/tesstrain/kbd/configs/kbd.wordlist'), 'r', encoding='utf-8') as f:
    existing_words = set(f.read().split('\n'))


def get_words_from_diff(pdf_dir, filter_prefix=None):
    interesting_words = set()

    cmpr_dirs = [d for d in os.listdir(pdf_dir) if d.startswith('cmpr_')]
    if filter_prefix:
        cmpr_dirs = [d for d in cmpr_dirs if d.startswith(filter_prefix)]

    diff_counts = defaultdict(int)
    for cmpr_dir in cmpr_dirs:
        df = pd.read_csv(os.path.join(pdf_dir, cmpr_dir, 'diff_words.csv'))

        for col in df.columns:
            col_values = set(df[col].dropna().values)
            diff_counts[cmpr_dir] = len(col_values)
            uniq_words = set(chain.from_iterable([col_value.split(' ') for col_value in col_values]))
            interesting_words.update(uniq_words)

    pprint(sorted(diff_counts.items(), key=lambda x: x[1], reverse=True))
    return interesting_words


def filter_out_words(words):
    return [word for word in words if word not in existing_words]


def filter_words(words):
    return [word for word in words if len(word) > 2 and word in existing_words]


In [51]:
pdf_dir = os.path.join('..', 'data/dag_results/pdf_processing/dysche_zhyg.pdf')
pdf_name = os.path.basename(pdf_dir).split('.')[0]

filter_prefix = None
interesting_words = get_words_from_diff(pdf_dir, filter_prefix=filter_prefix)
print(len(interesting_words))

interesting_words = filter_out_words(interesting_words)
not_interesting_words = filter_words(interesting_words)

[('cmpr_kbd_0.009_4360_66700_vs_ng_lines_1000_5_oshamaho_0.049_2738_19500',
  2072),
 ('cmpr_kbd_0.009_4360_66700_vs_ng_lines_1000_4_oshamaho_0.094_2811_17600',
  1935),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_2_40k_0.028_2149_16400', 1872),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_3_40k_0.018_2113_17300', 1825),
 ('cmpr_kbd_0.009_4360_66700_vs_low_confidence_lines_dysche_zhyg_0.010_1539_19100',
  1820),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_1_5000_hight_adyghepsale_ru_0.005_313_12400',
  1763),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_23_40k_0.011_139_9300', 1742),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_1_5000_adyghepsale_ru_0.009_127_8000',
  1668),
 ('cmpr_kbd_0.009_4360_66700_vs_interest_word_lines_dysche_zhyg_0.005_1472_14400',
  1660),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_5000_all_book_0.010_1140_5900',
  1647),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_32_40k_0.014_58_3100', 1647),
 ('cmpr_kbd_0.009_4360_66700_vs_interest_word_lin

In [52]:
import nltk
from itertools import chain

not_interesting_chars = set(chain.from_iterable([nltk.ngrams(line, 2) for line in not_interesting_words]))
interesting_chars = set(chain.from_iterable([nltk.ngrams(line, 2) for line in interesting_words]))
invalid_chars = interesting_chars - not_interesting_chars
print(invalid_chars)

{('И', 'Л'), ('Щ', 'Р'), ('н', 'ь'), ('Э', '5'), ('у', 'т'), ('х', 'т'), ('Ф', 'I'), ('я', 'I'), ('р', 'к'), ('щ', 'и'), ('ъ', 'т'), ('ф', 'ы'), ('С', 'Т'), ('о', 'ч'), ('т', 'ц'), ('и', '5'), ('с', 'к'), ('ш', 'о'), ('л', 'и'), ('б', 'ы'), ('Т', 'Ю'), ('Т', 'у'), ('З', 'а'), ('р', 'щ'), ('щ', 'Щ'), ('Я', 'I'), ('х', 'ъ'), ('у', 'ъ'), ('л', 'п'), ('Л', 'и'), ('ш', 'I'), ('с', 'щ'), ('ф', 'Р'), ('е', 'к'), ('м', 'ф'), ('л', 'Щ'), ('ъ', 'ъ'), ('к', 'а'), ('у', '2'), ('Э', 'у'), ('д', 'р'), ('Т', 'м'), ('и', 'у'), ('К', 'Щ'), ('М', 'д'), ('Д', 'Э'), ('э', 'Ъ'), ('р', 'о'), ('е', 'щ'), ('5', 'I'), ('Б', 'б'), ('Д', 'и'), ('я', 'н'), ('ж', 'Ё'), ('ь', 'р'), ('э', '3'), ('з', 'т'), ('а', 'н'), ('I', 'Ю'), ('р', 'I'), ('ю', 'ц'), ('Н', 'I'), ('1', 'Ш'), ('о', 'е'), ('э', 'Ф'), ('и', 'м'), ('с', 'I'), ('ы', 'ц'), ('н', 'я'), ('И', 'й'), ('Т', '1'), ('ж', 'с'), ('к', 'ж'), ('Е', 'А'), ('I', 'З'), ('Г', 'й'), ('I', 'м'), ('г', 'щ'), ('з', 'ъ'), ('о', 'г'), ('I', 'э'), ('е', 'I'), ('ь', 'В'), ('Й

In [53]:
invalid_chars = [''.join(char) for char in invalid_chars]
import json

json.dump(list(invalid_chars), open('invalid_chars.json', 'w'), indent=4, ensure_ascii=False)

In [54]:
from collections import Counter

cnt = Counter()
for word in interesting_words:
    cnt.update(nltk.ngrams(word, 2))

In [55]:
# most_common = [''.join(char) for char, _ in cnt.most_common(500)]
# json.dump(list(most_common), open('most_common_chars.json', 'w'), indent=4, ensure_ascii=False)

In [59]:
import os

from dags.src.text_cleaner import clean_text

input_dir = os.path.join('..', 'data/tesstrain/kbd/data/input')
output_dir = os.path.join('..', 'data/tesstrain/kbd/data/output')

file_path = os.path.join(input_dir, 'oshamaho.txt')
f_name = os.path.basename(file_path)

lines = []
with open(file_path, 'r', encoding='utf-8') as f:
    _lines = f.readlines()
    lines.extend(_lines)

lines = [clean_text(line) for line in lines]

In [66]:
import nltk
from tqdm import tqdm
from collections import Counter

cnt_from_orig = Counter()
for i, line in tqdm(enumerate(lines), total=len(lines)):
    cnt_from_orig.update(nltk.ngrams(line, 2))

100%|██████████| 368263/368263 [00:11<00:00, 32626.94it/s]


In [67]:
for ch, freq in sorted(cnt.items(), key=lambda x: x[1]):
    freq_from_orig = cnt_from_orig[ch]
    if freq >= freq_from_orig:
        print(ch, freq, freq_from_orig)

('ы', 'Ъ') 1 0
('Ь', 'ь') 1 0
('щ', 'ш') 1 0
('О', 'Ж') 1 0
('В', 'Е') 1 0
('к', 'К') 1 1
('Г', 'Р') 1 0
('С', 'с') 1 0
('и', 'П') 1 0
('7', 'Ж') 1 0
('и', 'Ж') 1 0
('1', 'И') 1 0
('И', 'А') 1 0
('ш', '2') 1 0
('Ч', '3') 1 0
('Г', 'т') 1 0
('щ', 'Ц') 1 0
('Щ', 'л') 1 0
('ж', 'я') 1 1
('Б', 'б') 1 0
('Ф', 'Х') 1 0
('Л', '1') 1 0
('И', 'а') 1 1
('э', 'Ш') 1 1
('и', 'С') 1 0
('О', 'Щ') 1 0
('Ж', 'Я') 1 0
('0', 'г') 1 0
('ш', 'Ц') 1 0
('т', 'Ё') 1 0
('_', 'М') 1 0
('я', '1') 1 0
('З', 'Ч') 1 0
('г', '1') 1 0
('5', 'л') 1 0
('ы', '0') 1 0
('Д', '5') 1 0
('о', 'Г') 1 0
('С', 'З') 1 0
('Х', 'Б') 1 0
('Ж', '6') 1 0
('6', 'э') 1 0
('м', '4') 1 1
('Ш', 'с') 1 0
('х', 'У') 1 0
('2', 'Щ') 1 0
('с', 'Ф') 1 0
('ж', 'ф') 1 0
('ы', 'Ж') 1 0
('И', 'Ы') 1 0
('м', 'О') 1 0
('ж', 'Э') 1 0
('ь', 'й') 1 0
('Щ', 'Е') 1 0
('Е', 'А') 1 0
('1', 'ы') 1 0
('У', '5') 1 0
('Е', 'Е') 1 0
('Т', 'ь') 1 0
('й', 'Й') 1 0
('Г', 'Ы') 1 0
('Э', 'I') 1 0
('т', 'Щ') 1 0
('Я', 'э') 1 0
('т', 'Ф') 1 0
('ц', 'Л') 1 0
('Т', 'с')