In [None]:
import os
import re
import pandas as pd
import string
import pymorphy2
import nltk
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.probability import FreqDist
from pymorphy2 import tokenizers
from wordcloud import WordCloud

# Fetch the 'punkt' resource through NLTK's downloader (presumably the
# tokenizer data needed by word_tokenize -- confirm it is still required).
nltk.download('punkt')
# One shared pymorphy2 analyzer instance; advprocessing() below reads this
# module-level name instead of constructing its own analyzer.
MORPH = pymorphy2.MorphAnalyzer()

# Загрузка данных

In [None]:
# Read the whole source file into a single string, decoding it as UTF-8.
with open('pushkin-metel.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# Sanity check: total character count and the raw slice [2200:2500].
print('total symbols:', len(text))
print('sample of text:', text[2200:2500])

# Предварительная обработка

In [None]:
def preprocessing(text):
    """Reduce raw text to lowercase letters separated by single spaces.

    Every run of characters that are not Cyrillic or Latin letters (digits,
    punctuation, any whitespace including newlines and tabs) is collapsed
    into one space. The letter 'ё' is folded into 'е'.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string: lowercase, stripped of leading/trailing spaces,
        containing only the characters 'а'-'я', 'a'-'z' and single spaces.
    """
    # Fold 'ё'/'Ё' BEFORE filtering: their code points (U+0451 / U+0401) lie
    # outside the 'а-я' / 'А-Я' ranges, so the character class below would
    # otherwise turn them into spaces and split words such as 'её' or 'ёлка'.
    folded = text.replace('ё', 'е').replace('Ё', 'Е')
    # The negated class also matches '\n', '\t' and '\r', so no separate
    # whitespace replacement pass is needed.
    return re.sub('[^а-яА-Яa-zA-Z]+', ' ', folded).strip().lower()

In [None]:
# Replace the raw text with its cleaned form. After this line the original
# string is no longer reachable under the name `text`.
text = preprocessing(text)
# Same sanity check as after loading, now on the cleaned string.
print('total symbols:', len(text))
print('sample of text:', text[2200:2500])

# Глубокая обработка текста

Приведем слова к [нормальной форме](https://ru.wikipedia.org/wiki/Лемматизация) и отбросим служебные части речи (междометия, частицы, союзы, предлоги):

In [None]:
def advprocessing(text):
    """Lemmatize a whitespace-separated text and drop function words.

    Each token from ``text.split()`` is analysed with the module-level
    ``MORPH`` analyzer and only the first parse it returns is used. Tokens
    whose part-of-speech tag is INTJ, PRCL, CONJ or PREP are skipped; tokens
    with no part-of-speech tag are kept.

    Args:
        text: String of tokens separated by whitespace.

    Returns:
        A tuple ``(lemmas, joined)``: the list of normal forms of the kept
        tokens in their original order, and that list joined by single
        spaces.
    """
    skipped_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}
    # Lazily take the first parse of every token.
    first_parses = (MORPH.parse(token)[0] for token in text.split())
    lemmas = [
        parse.normal_form
        for parse in first_parses
        if parse.tag.POS not in skipped_pos
    ]
    return lemmas, ' '.join(lemmas)

In [None]:
# Lemmatize the cleaned text. NOTE: `text` is rebound to the space-joined
# lemmas, so re-running this cell alone feeds already-lemmatized text back
# into advprocessing().
text_tokens, text = advprocessing(text)
# Size of the result in characters and in tokens, plus small samples of both
# representations.
print('total symbols:', len(text))
print('total words:', len(text_tokens))
print('sample of text:', text[2200:2500])
print('sample of text tokens:', text_tokens[:50])

# Визуализация результатов

In [None]:
# Count how many times each lemma occurs in the token list.
freq_dist = FreqDist(text_tokens)
# Bare expression: the notebook displays the distribution as the cell output.
freq_dist

In [None]:
# Ten most frequent lemmas as (lemma, count) pairs.
print('most common 10 words:', freq_dist.most_common(10))

In [None]:
# Create the figure and set its title first; the frequency plot is drawn
# afterwards through the pyplot state machine.
plt.figure(figsize=(16, 8))
plt.title('50 самых частых слов в тексте')
# Plot raw (non-cumulative) counts for the 50 most frequent lemmas.
freq_dist.plot(50, cumulative=False)
plt.show()

In [None]:
# Build a word cloud on a white background from `text`, which in a
# top-to-bottom run is the space-joined lemma string at this point.
wordcloud = WordCloud(background_color='white').generate(text)

In [None]:
# Render the word cloud as an image with the axes hidden.
plt.figure(figsize=(16, 8))
plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()