<a href="https://colab.research.google.com/github/tyugv/text_clusterization/blob/main/Text_clusterization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymorphy2
!pip install pyLDAvis

In [None]:
import nltk
import gensim
import pandas as pd
import re
import pymorphy2
import os
import pickle
import pyLDAvis
from gensim.models.phrases import Phrases
from pyLDAvis import gensim_models
from tqdm.notebook import tqdm_notebook

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning)

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
class Pipeline:
  """Topic-modelling pipeline for review texts.

  Steps performed by `run`: tokenise and lemmatise every review
  (`preprocess`), optionally merge adjacent tokens into phrases
  (`generate_N_grams`), build a gensim dictionary and bag-of-words corpus,
  fit an LDA model, and prepare / export a pyLDAvis visualization.
  """

  # Compiled once at class creation: preprocess() applies the first pattern to
  # every review and the second to every token.
  _DIGITS_RE = re.compile(r"\d+")
  _NON_WORD_RE = re.compile(r"[^\w]")

  def __init__(self, stopwords = None):
    """Create the pipeline.

    Args:
      stopwords: iterable of lemmas to drop during preprocessing. When None,
        NLTK's Russian stopword list is used. An explicitly passed empty
        collection is respected and disables stopword filtering.
    """
    # `is None` rather than truthiness: an empty list is a legitimate request
    # for "no stopwords", and array-like inputs have no unambiguous truth value.
    if stopwords is None:
      stopwords = nltk.corpus.stopwords.words('russian')

    self.stopwords = stopwords
    self.morph = pymorphy2.MorphAnalyzer()

    pyLDAvis.enable_notebook()
    # Directory that holds the default visualization file; exist_ok avoids the
    # check-then-create race of listing the working directory first.
    os.makedirs('ldavis_prepared', exist_ok=True)
    self.LDAvis_data_filepath = os.path.join('ldavis_prepared', 'ldavis_prepared')

  def preprocess(self, text: str):
    """Turn one review into a list of normalised tokens.

    Digits are removed, the text is tokenised with NLTK (Russian rules), every
    non-word character is stripped from each token, and the token is replaced
    by the normal form of its first pymorphy2 parse. Empty tokens and
    stopwords are dropped.

    Args:
      text: raw review text.

    Returns:
      List of lemmas in their original order.
    """
    text = self._DIGITS_RE.sub("", text)
    # Built per call so later changes to self.stopwords are honoured, while
    # still giving O(1) membership tests inside the token loop.
    stopwords = set(self.stopwords)

    preprocessed = []
    for token in nltk.word_tokenize(text, language="russian"):
      cleaned = self._NON_WORD_RE.sub('', token)
      if not cleaned:
        # Pure punctuation: nothing to lemmatise, skip the analyzer call.
        continue
      lemma = self.morph.parse(cleaned)[0].normal_form
      if lemma != '' and lemma not in stopwords:
        preprocessed.append(lemma)
    return preprocessed

  @staticmethod
  def generate_N_grams(reviews, N = 1, min_count = 1):
    """Replace each tokenised review by the phrases detected in it.

    For N <= 1 the input is returned unchanged. Otherwise a gensim `Phrases`
    model is trained and applied N - 1 times, each pass working on the output
    of the previous one, and only tokens containing '_' are kept.

    NOTE: the '_' filter also keeps tokens that already contained an
    underscore before phrase detection (`preprocess` does not strip
    underscores, since `\\w` matches them).

    Args:
      reviews: iterable of token lists.
      N: number of phrase-merging passes plus one.
      min_count: forwarded to `Phrases` as its `min_count`.

    Returns:
      `reviews` itself when N <= 1, else a new list of token lists holding
      only the '_'-joined tokens.
    """
    if N <= 1:
      return reviews

    # Materialise once: the corpus is iterated twice per pass (training the
    # model, then applying it). The caller's data is never mutated.
    merged = list(reviews)
    for _ in range(N - 1):
      phrases = Phrases(merged, min_count=min_count)
      merged = [phrases[review] for review in merged]
    return [[token for token in review if '_' in token] for review in merged]

  @staticmethod
  def make_dictionary(reviews: list):
    """Build a gensim `Dictionary` from tokenised reviews."""
    return gensim.corpora.Dictionary(reviews)

  @staticmethod
  def make_corpus(dictionary, reviews: list):
    """Convert every tokenised review to its bag-of-words form via `dictionary.doc2bow`."""
    return [dictionary.doc2bow(text) for text in reviews]

  @staticmethod
  def lda_model(corpus, dictionary, num_topics: int = 10):
    """Fit a gensim `LdaModel` with `num_topics` topics on `corpus`."""
    return gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

  def make_vizualization(self, corpus, dictionary, lda_model, filename=None):
    """Prepare pyLDAvis data for the model and pickle it to `filename`.

    Args:
      corpus: bag-of-words corpus the model was trained on.
      dictionary: gensim dictionary matching `corpus`.
      lda_model: fitted LDA model.
      filename: target path; defaults to `self.LDAvis_data_filepath`.
    """
    filename = filename or self.LDAvis_data_filepath
    LDAvis_prepared = gensim_models.prepare(lda_model, corpus, dictionary)
    with open(filename, 'wb') as f:
      pickle.dump(LDAvis_prepared, f)

  def run_vizualization(self, filename=None):
    """Load pickled pyLDAvis data, export it as HTML and return it.

    The HTML file is written next to the pickle as `filename + '.html'`.

    Args:
      filename: path of the pickle written by `make_vizualization`;
        defaults to `self.LDAvis_data_filepath`.

    Returns:
      The unpickled prepared-data object.
    """
    filename = filename or self.LDAvis_data_filepath
    # SECURITY: pickle.load executes arbitrary code from the file. Only open
    # files produced by make_vizualization, never untrusted input.
    with open(filename, 'rb') as f:
      LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared, filename +'.html')
    return LDAvis_prepared

  def run(self, reviews_list, n: int = 3, num_topics: int = 10, filename=None):
    """Run the whole pipeline on raw review texts.

    Args:
      reviews_list: iterable of raw review strings.
      n: passed to `generate_N_grams` as `N`.
      num_topics: number of LDA topics.
      filename: where the prepared visualization is pickled (the HTML export
        goes to `filename + '.html'`); defaults to
        `self.LDAvis_data_filepath`.

    Returns:
      Tuple `(reviews, ngrams, dictionary, corpus, lda_model, prepared)`:
      the preprocessed token lists, the phrase-only token lists, the gensim
      dictionary, the bag-of-words corpus, the fitted model and the pyLDAvis
      prepared data.
    """
    filename = filename or self.LDAvis_data_filepath
    print('preprocessing')
    reviews = [self.preprocess(review) for review in tqdm_notebook(reviews_list)]

    ngrams = self.generate_N_grams(reviews, n)
    dictionary = self.make_dictionary(ngrams)
    corpus = self.make_corpus(dictionary, ngrams)
    lda_model = self.lda_model(corpus, dictionary, num_topics=num_topics)

    self.make_vizualization(corpus, dictionary, lda_model, filename)
    return reviews, ngrams, dictionary, corpus, lda_model, self.run_vizualization(filename)

In [None]:
# Load the reviews table; the file is semicolon-separated.
# Later cells read its `rating_value` and `review_body` columns.
df = pd.read_csv('<path>', sep=';')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Split the review texts by rating: above 3 is "good", 3 or below is "bad",
# and reviews whose rating is missing are kept apart as "unknown".
good_reviews = df.loc[df.rating_value > 3, 'review_body']
bad_reviews = df.loc[df.rating_value <= 3, 'review_body']
unknown_reviews = df.loc[df.rating_value.isnull(), 'review_body']

In [None]:
# Report how many reviews fell into each group (good, bad, unknown).
print(len(good_reviews), len(bad_reviews), len(unknown_reviews))

524 7004 6442


In [None]:
# Custom stopword list: the first 46 entries of NLTK's Russian list plus a
# few extra words chosen for this corpus.
stopwords = nltk.corpus.stopwords.words('russian')[:46] + ['все', 'еще', 'это', 'год', 'г']
# Keep the negations as tokens (presumably because they carry sentiment in
# reviews). list.remove drops only the first occurrence and raises ValueError
# if the word is absent from the list.
stopwords.remove('не')
stopwords.remove('нет')
pipeline = Pipeline(stopwords)

In [None]:
# Run the pipeline on the good reviews with the default settings of
# Pipeline.run (n=3, num_topics=10). The prepared visualization is pickled to
# 'good_reviews' and exported to 'good_reviews.html' in the working directory.
# Of the six returned values only the n-grams, the model and the prepared
# visualization are kept.
_, good_reviews_ngrams, _, _, good_reviews_model, good_reviews_vizualization = \
pipeline.run(good_reviews, filename='good_reviews')

# Last expression of the cell, so the notebook displays the visualization.
good_reviews_vizualization

preprocessing


  0%|          | 0/524 [00:00<?, ?it/s]

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Same pipeline on the bad reviews, with 5 topics instead of the default 10.
# Output files: 'bad_reviews' (pickle) and 'bad_reviews.html'.
_, bad_reviews_ngrams, _, _, bad_reviews_model, bad_reviews_vizualization = \
pipeline.run(bad_reviews, num_topics=5, filename='bad_reviews')
# Last expression of the cell, so the notebook displays the visualization.
bad_reviews_vizualization

In [None]:
# Same pipeline on the reviews without a rating, again with 5 topics.
# Output files: 'unknown_reviews' (pickle) and 'unknown_reviews.html'.
_, unknown_reviews_ngrams, _, _, unknown_reviews_model, unknown_reviews_vizualization = \
pipeline.run(unknown_reviews, num_topics=5, filename='unknown_reviews')
# Last expression of the cell, so the notebook displays the visualization.
unknown_reviews_vizualization

preprocessing


  0%|          | 0/6442 [00:00<?, ?it/s]

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Результаты

In [None]:
import textwrap

def print_review(review):
  """Print `review` re-flowed so that no line is longer than 100 characters."""
  print(textwrap.fill(review, width=100))

def reviews_with_keyword(reviews, ngrams, keyword, max_shown=11):
  """Print the reviews whose n-gram list contains `keyword`.

  `reviews` and `ngrams` are paired by position: the i-th review belongs to
  the i-th n-gram list. Pairing is done by iterating both in lockstep rather
  than by `reviews[i]`, because on a pandas Series with a non-default index
  (e.g. a filtered DataFrame column) integer subscripts are label lookups and
  would raise KeyError or return a different review.

  Args:
    reviews: iterable of review texts (presumably the raw strings that were
      passed to the pipeline -- each item is handed to `print_review`).
    ngrams: iterable of token collections, one per review, in the same order.
    keyword: token to look for in each review's n-grams.
    max_shown: maximum number of matching reviews to print; the default of 11
      matches the previous hard-coded cut-off. Pass None to print all.
  """
  matching = [review for review, tokens in zip(reviews, ngrams) if keyword in tokens]

  print(f'Found {len(matching)} reviews with keyword {keyword}')
  print('---------')
  for review in matching[:max_shown]:
    print_review(review)
    print()