In [1]:
from dags.src.text_cleaner import clean_text

In [2]:
from collections import Counter
import os
import nltk
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

nltk.download("stopwords")

input_dir = os.path.join("..", "data/tesstrain/kbd/data/input")
output_dir = os.path.join("..", "data/tesstrain/kbd/data/output")

file_path = os.path.join(input_dir, "all_book.txt")
f_name = os.path.basename(file_path)

BUF_SIZE = 100000


def process_chunk(chunk, collocation_limit=2000):
    chunk = clean_text(chunk)
    tokens = nltk.word_tokenize(chunk)
    text = nltk.Text(tokens)
    clc = text.collocation_list(collocation_limit)
    return clc


def get_chunks(file_path):
    chunks = []

    with open(file_path) as f:
        while True:
            chunk = f.read(BUF_SIZE)
            if not chunk:
                break
            chunks.append(chunk)

    return chunks


def calc_bigrams(file_path, par_factor=4, chunk_limit=1000):
    cnt = Counter()

    chunks = get_chunks(file_path)
    chunks = chunks[:chunk_limit]
    print(f"Число чанков: {len(chunks)}")

    with tqdm(total=len(chunks)) as pbar, ThreadPoolExecutor(max_workers=par_factor) as executor:
        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
        for future in concurrent.futures.as_completed(futures):
            try:
                clc = future.result()
                cnt.update(clc)
                pbar.update(1)
            except Exception as e:
                print(f"Ошибка: {e}")

    return cnt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/panagoa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
limit = 5000

cnt = calc_bigrams(file_path, par_factor=4)

bigrams_out_path = os.path.join(output_dir, f"bigrams_freq_{limit}_{f_name}")
with open(bigrams_out_path, "w") as f:
    bgrm_str = "\n".join([" ".join(b) for b, frq in cnt.most_common(limit)])
    f.write(bgrm_str)

Число чанков: 309


  0%|          | 0/309 [00:00<?, ?it/s]

Ошибка: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'
Ошибка: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__reader_cls'


 99%|█████████▉| 307/309 [00:11<00:00, 25.84it/s] 


In [4]:
cnt.most_common(100)

[(('нэхърэ', 'нэхъ'), 209),
 (('дэнэ', 'къэна'), 179),
 (('сыт', 'хуэдэ'), 156),
 (('сыт', 'щыгъуи'), 118),
 (('иджыри', 'къэс'), 113),
 (('абы', 'щхьэкэ'), 82),
 (('Ауэ', 'абы'), 79),
 (('куэд', 'щауэ'), 79),
 (('ауэ', 'сытми'), 76),
 (('абы', 'щыгъуэ'), 76),
 (('махуэ', 'къэс'), 75),
 (('эмал', 'имыэу'), 72),
 (('Степан', 'Ильич'), 67),
 (('щалэ', 'цыку'), 66),
 (('сыту', 'жыпэмэ'), 66),
 (('Нал', 'къута'), 66),
 (('Абы', 'щыгъуэ'), 65),
 (('Мазэ', 'ныкъуэ'), 65),
 (('Хъуэпсэгъуэ', 'нур'), 64),
 (('ныкъуэ', 'щхъуантIэ'), 64),
 (('Сыт', 'хуэдэ'), 63),
 (('Вагъуэ', 'махуэ'), 63),
 (('Мывэ', 'хуабэ'), 63),
 (('сыт', 'щхьэкэ'), 62),
 (('Бабыщыкъуэ', 'адакъэпщ'), 62),
 (('щалэ', 'цыкур'), 61),
 (('Кхъухь', 'пхэнж'), 61),
 (('лъапэхэм', 'деж'), 61),
 (('Бгы', 'лъапэхэм'), 61),
 (('лъэрыгъыпс', 'тIыгъа'), 60),
 (('абы', 'хуэдэ'), 59),
 (('япэ', 'дыдэ'), 58),
 (('жыхуаэм', 'хуэдэу'), 58),
 (('Бдзэжьеящэм', 'ипхъу'), 58),
 (('щалэ', 'цыкум'), 57),
 (('ужь', 'иту'), 57),
 (('псом', 'хуэмыдэу')