In [52]:
from dags.src.text_cleaner import clean_text
import os
import re

input_dir = os.path.join("..", "data/tesstrain/kbd/data/input")
output_dir = os.path.join("..", "data/tesstrain/kbd/data/output")

file_path = os.path.join(input_dir, "oshamaho.txt")
f_name = os.path.basename(file_path)

lines = []
with open(file_path, "r", encoding="utf-8") as f:
    _lines = f.readlines()
    lines.extend(_lines)

lines = [line.strip() for line in set(lines) if len(line.strip()) > 20]
lines = [clean_text(line) for line in lines]

In [53]:
with open(os.path.join("..", "data/tesstrain/kbd/configs/kbd.wordlist"), "r", encoding="utf-8") as f:
    existing_words = set(f.read().split("\n"))

In [54]:
from pprint import pprint
from collections import defaultdict
import random
import pandas as pd
from itertools import chain
import csv


def get_words_from_diff(pdf_dir, filter_prefix=None):
    interesting_words = set()

    cmpr_dirs = [d for d in os.listdir(pdf_dir) if d.startswith("cmpr_")]
    if filter_prefix:
        cmpr_dirs = [d for d in cmpr_dirs if d.startswith(filter_prefix)]

    diff_counts = defaultdict(int)
    for cmpr_dir in cmpr_dirs:
        df = pd.read_csv(os.path.join(pdf_dir, cmpr_dir, "diff_words.csv"))

        for col in df.columns:
            col_values = set(df[col].dropna().values)
            diff_counts[cmpr_dir] = len(col_values)
            uniq_words = set(chain.from_iterable([col_value.split(" ") for col_value in col_values]))
            interesting_words.update(uniq_words)

    pprint(sorted(diff_counts.items(), key=lambda x: x[1], reverse=True))
    return interesting_words


def get_words_with_low_confidence(pdf_dir, prefix="filtered_0_30"):
    interesting_words = set()

    cmpr_dirs = [d for d in os.listdir(pdf_dir) if d.startswith("cmpr_")]
    for cmpr_dir in cmpr_dirs:
        for low_conf_csv in os.listdir(os.path.join(pdf_dir, cmpr_dir)):
            if low_conf_csv.startswith(prefix):
                df = pd.read_csv(
                    os.path.join(pdf_dir, cmpr_dir, low_conf_csv),
                    sep="\t",
                    quoting=csv.QUOTE_NONE,
                    encoding="utf-8",
                )
                col_values = set(df["text"].dropna().values)
                interesting_words.update(col_values)

    return interesting_words


def filter_words(words):
    return [word for word in words if len(word) > 2 and word in existing_words]

In [68]:
pdf_dir = os.path.join("..", "data/dag_results/pdf_processing/dysche_zhyg.pdf")
pdf_name = os.path.basename(pdf_dir).split(".")[0]

filter_prefix = None
interesting_words = get_words_from_diff(pdf_dir, filter_prefix=filter_prefix)
print(len(interesting_words))
filtered_words = filter_words(interesting_words)
print(len(filtered_words))

filter_prefix = filter_prefix or ""

with open(os.path.join(output_dir, f"interesting_words_{filter_prefix}_{pdf_name}.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(filtered_words)))

[('cmpr_kbd_0.009_4360_66700_vs_confidence_filtered_0_60_lines_dysche_zhyg_0.023_1530_15000',
  2104),
 ('cmpr_kbd_0.009_4360_66700_vs_ng_lines_1000_5_oshamaho_0.049_2738_19500',
  2072),
 ('cmpr_kbd_0.009_4360_66700_vs_kbd_font_helvetica_0.010_1975_12100', 2024),
 ('cmpr_kbd_0.009_4360_66700_vs_ng_lines_1000_4_oshamaho_0.094_2811_17600',
  1935),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_2_40k_0.028_2149_16400', 1872),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_3_40k_0.018_2113_17300', 1825),
 ('cmpr_kbd_0.009_4360_66700_vs_low_confidence_lines_dysche_zhyg_0.010_1539_19100',
  1820),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_1_5000_hight_adyghepsale_ru_0.005_313_12400',
  1763),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_1_5000_adyghepsale_ru_0.009_127_8000',
  1668),
 ('cmpr_kbd_0.009_4360_66700_vs_interest_word_lines_dysche_zhyg_0.005_1472_14400',
  1660),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_5000_all_book_0.010_1140_5900',
  1647),
 ('cmpr_kbd_0.009_4360_667

In [69]:
def create_lines(words, word_count_for_line=5):
    _words = words.copy()
    limit = len(_words) // word_count_for_line

    lines = []
    for _ in range(limit):
        line_words = []
        for _ in range(word_count_for_line):
            if len(_words) == 0:
                break

            rnd_word = random.choice(_words)
            _words.remove(rnd_word)
            line_words.append(rnd_word)

        lines.append(" ".join(line_words))

    return lines

In [70]:
lines = []
for _ in range(5):
    lines.extend(create_lines(filtered_words))

print(len(lines))
with open(os.path.join(output_dir, f"interest_word_lines_{filter_prefix}_{pdf_name}.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(lines)))

6350


In [50]:
pdf_dir = os.path.join("..", "data/dag_results/pdf_processing/dysche_zhyg.pdf")
prefix = "filtered_0_60"
words = get_words_with_low_confidence(pdf_dir, prefix=prefix)
filtered_words = filter_words(words)
lines = create_lines(filtered_words)
with open(os.path.join(output_dir, f"confidence_{prefix}_lines_{pdf_name}.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(lines)))

In [71]:
interesting_words = get_words_from_diff(pdf_dir, filter_prefix=None)

[('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_1_40k_TNR_0.041_315_8400',
  2765),
 ('cmpr_kbd_0.009_4360_66700_vs_confidence_filtered_0_60_lines_dysche_zhyg_0.023_1530_15000',
  2104),
 ('cmpr_kbd_0.009_4360_66700_vs_ng_lines_1000_5_oshamaho_0.049_2738_19500',
  2072),
 ('cmpr_kbd_0.009_4360_66700_vs_kbd_font_helvetica_0.010_1975_12100', 2024),
 ('cmpr_kbd_0.009_4360_66700_vs_ng_lines_1000_4_oshamaho_0.094_2811_17600',
  1935),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_2_40k_0.028_2149_16400', 1872),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_3_40k_0.018_2113_17300', 1825),
 ('cmpr_kbd_0.009_4360_66700_vs_low_confidence_lines_dysche_zhyg_0.010_1539_19100',
  1820),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_1_5000_hight_adyghepsale_ru_0.005_313_12400',
  1763),
 ('cmpr_kbd_0.009_4360_66700_vs_40k_char_ngrams_23_40k_0.011_139_9300', 1742),
 ('cmpr_kbd_0.009_4360_66700_vs_bigrams_freq_1_5000_adyghepsale_ru_0.009_127_8000',
  1668),
 ('cmpr_kbd_0.009_4360_66700_vs_interest_