In [None]:
import functools
import glob
import os
import re

import tqdm
from natasha import (
    Segmenter, MorphVocab,
    NewsNERTagger,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

In [None]:
def poem(text, max_lowercase_ratio=0.3):
    """Heuristically decide whether ``text`` is laid out like a poem.

    The share of non-blank lines whose first non-whitespace character is a
    lowercase letter is computed; the text counts as a poem when that share
    is below ``max_lowercase_ratio``.

    Args:
        text: The text to classify; lines are separated by ``'\\n'``.
        max_lowercase_ratio: Exclusive upper bound on the share of lines
            starting with a lowercase letter. Defaults to 0.3.

    Returns:
        True if the text looks like a poem, False otherwise. Text with no
        non-blank lines is never a poem.
    """
    # Blank / whitespace-only lines carry no signal. They must not be counted
    # in the denominator either, otherwise trailing newlines or paragraph
    # breaks dilute the ratio and prose is misclassified as a poem.
    stripped_lines = (line.strip() for line in text.split('\n'))
    content_lines = [line for line in stripped_lines if line]

    # ``str.split`` always returns at least one element, so emptiness has to
    # be checked after the blank lines are dropped.
    if not content_lines:
        return False

    lowercase_starts = sum(line[0].islower() for line in content_lines)
    not_poem_probability = lowercase_starts / len(content_lines)
    return not_poem_probability < max_lowercase_ratio

In [None]:
@functools.lru_cache(maxsize=1)
def _load_natasha_pipeline():
    """Build the Natasha components once and reuse them for every file."""
    emb = NewsEmbedding()
    return Segmenter(), NewsMorphTagger(emb), NewsNERTagger(emb), MorphVocab()


def prepare_russian_text(input_file, output_file):
    """Normalize one Russian text file and write the result to ``output_file``.

    Steps performed:
      1. Read the file, join its lines with spaces and replace every run of
         digits with ``'0'``.
      2. Run NER and replace each entity span with a label: ``'person1'`` for
         ``PER``; any other entity type gets a numeric string label, assigned
         from 5 upwards in the order the types are first met while scanning
         spans from the end of the text.
      3. Re-tokenize and morph-tag the substituted text, then emit one
         space-terminated item per token: a run of consecutive non-digit
         ``NUM`` tokens becomes a single ``'0'``, tokens whose POS tag is a
         key of the label table become that label, punctuation is dropped and
         everything else is lemmatized and lower-cased.

    Args:
        input_file: Path of the text file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8).
    """
    segmenter, morph_tagger, ner_tagger, morph_vocab = _load_natasha_pipeline()

    # Per-call table: it is extended below with the entity types found in
    # this particular text.
    label_dict = {'NUM': 'ordinal1', 'PRON': 'pron1', 'PER': 'person1'}
    next_label_num = 5

    # Explicit encoding: the locale default is not UTF-8 on every platform.
    with open(input_file, encoding='utf-8') as fin:
        raw_text = ' '.join(fin.readlines()).replace('\n', ' ')

    raw_text = re.sub(r'\d+', '0', raw_text)

    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)

    # Substitute from the last span to the first so the start/stop offsets of
    # the spans still to be processed stay valid.
    # NOTE(review): numeric labels such as '5' are re-tokenized by the second
    # pass and may be tagged NUM, which would emit 'ordinal1' — confirm this
    # is intended.
    for span in reversed(doc.ner.spans):
        if span.type not in label_dict:
            label_dict[span.type] = str(next_label_num)
            next_label_num += 1
        raw_text = "".join((raw_text[:span.start], label_dict[span.type], raw_text[span.stop:]))

    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    pieces = []
    prev_num = False
    for token in doc.tokens:
        if token.pos == 'NUM' and not token.text.isdigit():
            # Collapse a run of spelled-out numerals into a single '0'.
            if not prev_num:
                pieces.append('0')
                prev_num = True
            continue

        prev_num = False

        if token.pos in label_dict:
            pieces.append(label_dict[token.pos])
        elif token.pos != 'PUNCT':
            try:
                token.lemmatize(morph_vocab)
                pieces.append(token.lemma.lower())
            except Exception:
                # Best effort: fall back to the surface form when the token
                # cannot be lemmatized.
                pieces.append(token.text.lower())

    # Every emitted item is followed by exactly one space.
    prepared_text = ''.join(piece + ' ' for piece in pieces)

    with open(output_file, 'w', encoding='utf-8') as fout:
        fout.write(prepared_text)

In [None]:
def prepare_russian_texts_from_folder(input_folder, output_folder):
    """Preprocess every file directly inside ``input_folder``.

    Each regular file ``<name>`` is passed to ``prepare_russian_text`` and the
    result is written to ``<output_folder>/PREPROCESSED_<name>``. Files are
    handled in sorted path order with a tqdm progress bar.

    Args:
        input_folder: Directory holding the raw text files.
        output_folder: Directory for the results; created if it is missing.
    """
    # Without this every write fails when the folder does not exist yet.
    os.makedirs(output_folder, exist_ok=True)

    # '*' also matches sub-directories, which cannot be opened as text files.
    candidates = sorted(glob.glob(os.path.join(input_folder, '*')))
    files = [path for path in candidates if os.path.isfile(path)]

    for file in tqdm.tqdm(files):
        # basename/join instead of splitting on '/': glob returns paths with
        # the platform separator.
        output_name = 'PREPROCESSED_' + os.path.basename(file)
        output_file = os.path.join(output_folder, output_name)
        prepare_russian_text(file, output_file)

In [None]:
# Preprocess every file from the raw folder into the preprocessed folder.
prepare_russian_texts_from_folder(
    input_folder='raw_russian_texts',
    output_folder='preprocessed_russian_texts',
)