# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

### Задание № 1 (4 балла)

In [1]:
import gensim
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
morph = MorphAnalyzer()
warnings.filterwarnings("ignore")

In [2]:
# добавим лемматизацию
def normalize(text):
    normalized_text = [word.text.strip(punctuation) for word \
                                                            in razdel_tokenize(text)]
    normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20 ]
    normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
    return ' '.join(normalized_text)

In [3]:
data = pd.read_csv('avito_category_classification.csv')

In [4]:
data['description_norm'] = data['description'].apply(normalize)

In [5]:
vectorizer = CountVectorizer(min_df=5, max_df=0.5)

In [6]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))
    
    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))
    
    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )
    
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])
        
        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')
    
    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)
    
    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N
    
    return result, errors

__SGDClassifier__

In [7]:
# NMF
sgdc_nmf = Pipeline([
    ('bow', vectorizer),
    ('scaler', StandardScaler(with_mean=False)),
    ('svd', NMF(60)),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3))
])

In [8]:
metrics_sgdc_nmf, errors_sgdc_nmf = eval_table(data['description_norm'], data['category_name'], sgdc_nmf)
metrics_sgdc_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.71,0.06,0.61,0.07,0.65,0.05
Ремонт и строительство,0.61,0.13,0.3,0.06,0.39,0.04
Автомобили,0.79,0.11,0.82,0.04,0.8,0.04
Бытовая техника,0.66,0.15,0.17,0.05,0.26,0.06
Телефоны,0.73,0.1,0.6,0.15,0.64,0.08
Товары для детей и игрушки,0.58,0.18,0.6,0.11,0.56,0.07
Детская одежда и обувь,0.54,0.08,0.49,0.33,0.45,0.15
Квартиры,0.82,0.06,0.92,0.05,0.87,0.04
Мебель и интерьер,0.37,0.17,0.36,0.22,0.3,0.07
"Одежда, обувь, аксессуары",0.54,0.08,0.5,0.26,0.47,0.11


In [9]:
# SVD
sgdc_svd = Pipeline([
    ('bow', vectorizer),
    ('scaler', StandardScaler(with_mean=False)),
    ('svd', TruncatedSVD(500)),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3))
])

In [10]:
metrics_sgdc_svd, errors_sgdc_svd = eval_table(data['description_norm'], data['category_name'], sgdc_svd)
metrics_sgdc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.79,0.05,0.7,0.02,0.74,0.02
Ремонт и строительство,0.6,0.05,0.51,0.07,0.55,0.06
Автомобили,0.79,0.05,0.87,0.03,0.82,0.02
Бытовая техника,0.68,0.08,0.5,0.03,0.57,0.05
Телефоны,0.83,0.07,0.77,0.03,0.79,0.04
Товары для детей и игрушки,0.71,0.03,0.68,0.07,0.69,0.04
Детская одежда и обувь,0.71,0.02,0.81,0.03,0.76,0.02
Квартиры,0.96,0.02,0.91,0.02,0.93,0.01
Мебель и интерьер,0.7,0.07,0.57,0.05,0.63,0.03
"Одежда, обувь, аксессуары",0.72,0.03,0.73,0.03,0.72,0.01


__KNeighborsClassifier__

In [11]:
# NMF
knc_nmf = Pipeline([
    ('bow', vectorizer),
    ('svd', NMF(60)),
    ('clf', KNeighborsClassifier(n_neighbors=7))
])

In [12]:
metrics_knc_nmf, errors_knc_nmf = eval_table(data['description_norm'], data['category_name'], knc_nmf)
metrics_knc_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.52,0.06,0.57,0.06,0.54,0.05
Ремонт и строительство,0.42,0.08,0.18,0.03,0.25,0.05
Автомобили,0.52,0.05,0.64,0.05,0.57,0.05
Бытовая техника,0.27,0.03,0.19,0.04,0.22,0.03
Телефоны,0.7,0.07,0.46,0.1,0.55,0.09
Товары для детей и игрушки,0.54,0.06,0.31,0.05,0.39,0.05
Детская одежда и обувь,0.46,0.02,0.59,0.02,0.52,0.02
Квартиры,0.92,0.03,0.8,0.04,0.85,0.03
Мебель и интерьер,0.29,0.03,0.21,0.04,0.24,0.03
"Одежда, обувь, аксессуары",0.5,0.01,0.57,0.03,0.53,0.02


In [13]:
# SVD
knc_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', KNeighborsClassifier(n_neighbors=7))
])

In [14]:
metrics_knc_svd, errors_knc_svd = eval_table(data['description_norm'], data['category_name'], knc_svd)
metrics_knc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.7,0.05,0.51,0.04,0.59,0.04
Ремонт и строительство,0.4,0.08,0.21,0.06,0.28,0.06
Автомобили,0.52,0.04,0.63,0.03,0.57,0.02
Бытовая техника,0.42,0.06,0.29,0.05,0.35,0.05
Телефоны,0.72,0.05,0.37,0.03,0.49,0.02
Товары для детей и игрушки,0.6,0.06,0.3,0.05,0.4,0.05
Детская одежда и обувь,0.48,0.01,0.68,0.02,0.57,0.01
Квартиры,0.94,0.04,0.78,0.03,0.85,0.03
Мебель и интерьер,0.44,0.04,0.29,0.04,0.35,0.04
"Одежда, обувь, аксессуары",0.51,0.02,0.61,0.02,0.56,0.02


__RandomForestClassifier__

In [15]:
# NMF
rfc_nmf = Pipeline([
    ('bow', vectorizer),
    ('svd', NMF(60)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10))
])

In [16]:
metrics_rfc_nmf, errors_rfc_nmf = eval_table(data['description_norm'], data['category_name'], rfc_nmf)
metrics_rfc_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.7,0.05,0.58,0.02,0.63,0.02
Ремонт и строительство,0.66,0.05,0.23,0.05,0.34,0.05
Автомобили,0.81,0.06,0.77,0.04,0.79,0.03
Бытовая техника,1.0,0.0,0.04,0.02,0.08,0.04
Телефоны,0.81,0.05,0.65,0.05,0.72,0.04
Товары для детей и игрушки,0.81,0.05,0.44,0.03,0.57,0.03
Детская одежда и обувь,0.66,0.03,0.68,0.03,0.67,0.02
Квартиры,0.93,0.02,0.94,0.02,0.93,0.01
Мебель и интерьер,0.64,0.07,0.27,0.07,0.38,0.08
"Одежда, обувь, аксессуары",0.49,0.01,0.79,0.02,0.6,0.0


In [17]:
# SVD
rfc_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10))
])

In [18]:
metrics_rfc_svd, errors_rfc_svd = eval_table(data['description_norm'], data['category_name'], rfc_svd)
metrics_rfc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.77,0.02,0.51,0.03,0.61,0.01
Ремонт и строительство,0.66,0.18,0.09,0.03,0.16,0.06
Автомобили,0.9,0.04,0.59,0.02,0.71,0.02
Бытовая техника,0.76,0.16,0.05,0.02,0.1,0.03
Телефоны,0.93,0.06,0.42,0.07,0.58,0.08
Товары для детей и игрушки,0.82,0.07,0.23,0.04,0.36,0.05
Детская одежда и обувь,0.47,0.02,0.73,0.02,0.57,0.02
Квартиры,0.87,0.02,0.92,0.02,0.89,0.02
Мебель и интерьер,0.88,0.16,0.04,0.02,0.08,0.04
"Одежда, обувь, аксессуары",0.49,0.01,0.76,0.01,0.59,0.01


__ExtraTreesClassifier__

In [19]:
# NMF
etc_nmf = Pipeline([
    ('bow', vectorizer),
    ('svd', NMF(60)),
    ('clf', ExtraTreesClassifier(random_state=0))
])

In [20]:
metrics_etc_nmf, errors_etc_nmf = eval_table(data['description_norm'], data['category_name'], etc_nmf)
metrics_etc_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.71,0.05,0.77,0.03,0.74,0.03
Ремонт и строительство,0.6,0.06,0.39,0.08,0.47,0.08
Автомобили,0.78,0.01,0.87,0.04,0.82,0.02
Бытовая техника,0.53,0.09,0.22,0.04,0.31,0.05
Телефоны,0.82,0.03,0.73,0.03,0.77,0.02
Товары для детей и игрушки,0.68,0.04,0.53,0.05,0.59,0.04
Детская одежда и обувь,0.63,0.02,0.73,0.01,0.68,0.01
Квартиры,0.92,0.01,0.96,0.02,0.94,0.01
Мебель и интерьер,0.62,0.06,0.39,0.05,0.48,0.05
"Одежда, обувь, аксессуары",0.63,0.02,0.71,0.02,0.67,0.01


In [21]:
# SVD
etc_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', ExtraTreesClassifier(random_state=0))
])

In [22]:
metrics_etc_svd, errors_etc_svd = eval_table(data['description_norm'], data['category_name'], etc_svd)
metrics_etc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Предложение услуг,0.85,0.04,0.27,0.03,0.41,0.04
Ремонт и строительство,0.58,0.04,0.09,0.01,0.16,0.02
Автомобили,0.78,0.07,0.46,0.04,0.58,0.04
Бытовая техника,0.64,0.13,0.14,0.02,0.22,0.04
Телефоны,0.88,0.05,0.36,0.05,0.51,0.05
Товары для детей и игрушки,0.67,0.04,0.19,0.03,0.29,0.04
Детская одежда и обувь,0.46,0.01,0.74,0.05,0.57,0.02
Квартиры,0.75,0.03,0.81,0.06,0.77,0.04
Мебель и интерьер,0.71,0.05,0.15,0.02,0.24,0.03
"Одежда, обувь, аксессуары",0.48,0.02,0.73,0.01,0.58,0.01


In [23]:
all_metrics = [metrics_sgdc_nmf, metrics_sgdc_svd, metrics_knc_nmf, metrics_knc_svd, 
                 metrics_rfc_nmf, metrics_rfc_svd, metrics_etc_nmf, metrics_etc_svd]

all_names = ['SGDClassifier_NMF','SGDClassifier_SVD', 'KNeighborsClassifier_NMF', 'KNeighborsClassifier_SVD',
                'RandomForestClassifier_NMF', 'RandomForestClassifier_SVD', 'ExtraTreesClassifier_NMF', 'ExtraTreesClassifier_SVD']

In [24]:
metrics_comparison = pd.DataFrame()
for m in all_metrics:
    metrics_comparison = metrics_comparison.append(m.loc['mean'])
metrics_comparison.index=pd.Series(all_names)
display(metrics_comparison)

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
SGDClassifier_NMF,0.64,0.11,0.54,0.13,0.54,0.07
SGDClassifier_SVD,0.75,0.05,0.7,0.04,0.72,0.03
KNeighborsClassifier_NMF,0.51,0.04,0.45,0.05,0.47,0.04
KNeighborsClassifier_SVD,0.57,0.04,0.47,0.04,0.5,0.03
RandomForestClassifier_NMF,0.75,0.04,0.54,0.04,0.57,0.03
RandomForestClassifier_SVD,0.76,0.07,0.43,0.03,0.46,0.03
ExtraTreesClassifier_NMF,0.69,0.04,0.63,0.04,0.65,0.03
ExtraTreesClassifier_SVD,0.68,0.05,0.39,0.03,0.43,0.03


__Судя по результатам, наилучшее сочетание - это SGDClassifier и SVD.__

### Задание № 2 (6 баллов)

In [25]:
def metrics(corpus, texts, dictinary, lda):
    print(f'Perplexity: {np.exp2(-lda.log_perplexity(corpus))}')
    
    topics = []
    for topic_id, topic in lda.show_topics(num_topics=100, formatted=False):
        topic = [word for word, _ in topic]
        topics.append(topic)
        
    coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts_norm], 
                                                   dictionary=dictinary, coherence='c_v')
    
    print(f'Coherence: {coherence_model_lda.get_coherence()}')

__Модель без нграмм и tfidf__

In [26]:
texts = open('wiki_data.txt', encoding='utf-8').read().splitlines()[:5000]
texts_norm = ([normalize(text) for text in texts])

In [27]:
dictinary_1 = gensim.corpora.Dictionary((text.split() for text in texts_norm))

In [28]:
dictinary_1.filter_extremes(no_above=0.1, no_below=10)
dictinary_1.compactify()

In [29]:
corpus_1 = [dictinary_1.doc2bow(text.split()) for text in texts_norm]

In [30]:
lda_1 = gensim.models.LdaMulticore(corpus_1, 
                                 100, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary_1, 
                                 passes=10) 

In [31]:
metrics_1 = metrics(corpus_1, texts_norm, dictinary_1, lda_1)
print(metrics_1)

Perplexity: 337.5593909373696
Coherence: 0.51777926960439
None


__Модель с нграммами__

In [32]:
ngrams_1 = [text.split() for text in texts_norm]
ph = gensim.models.Phrases(ngrams_1, scoring='npmi', threshold=0.4) # threshold можно подбирать
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[ngrams_1] 

Проверка

In [33]:
[text[:15] for text in ngrammed_texts[:1]]

[['новостройка',
  'нижегородский_область',
  'новостро́йка',
  '—',
  'сельский',
  'посёлок',
  'в',
  'дивеевский_район',
  'нижегородский_область',
  'входить',
  'в',
  'состав_сатисский',
  'сельсовет',
  'посёлок',
  'расположить']]

In [34]:
dictinary_2 = gensim.corpora.Dictionary((ngrammed_texts))

In [35]:
dictinary_2.filter_extremes(no_above=0.1, no_below=10)
dictinary_2.compactify()

In [36]:
corpus_2 = [dictinary_2.doc2bow(text) for text in ngrammed_texts]

In [37]:
lda_2 = gensim.models.LdaMulticore(corpus_2, 
                                 100, # число тем
                                 alpha='asymmetric',
                                 id2word=dictinary_2, 
                                 passes=10) 

In [38]:
metrics_2 = metrics(corpus_2, ngrammed_texts, dictinary_2, lda_2)
print(metrics_2)

Perplexity: 430.19540812345906
Coherence: nan
None


__Модель с tfidf__

In [39]:
texts_ = open('wiki_data.txt', encoding='utf-8').read().splitlines()[:5000]
texts_norm = ([normalize(text) for text in texts_])

In [40]:
dictinary_3 = gensim.corpora.Dictionary((text.split() for text in texts_norm))

In [41]:
dictinary_3.filter_extremes(no_above=0.1, no_below=10)
dictinary_3.compactify()

In [42]:
corpus_3 = [dictinary_3.doc2bow(text.split()) for text in texts_norm]

In [43]:
tfidf = gensim.models.TfidfModel(corpus_3)
corpus_3 = [tfidf[text] for text in corpus_3]

In [45]:
lda_3 = gensim.models.LdaMulticore(corpus_3, 
                                 100, # число тем
                                 alpha='asymmetric',
                                 id2word=dictinary_3, 
                                 passes=10)

In [46]:
metrics_3 = metrics(corpus_3, texts_norm, dictinary_3, lda_3)
print(metrics_3)

Perplexity: 20364.41663545552
Coherence: 0.4055424811098349
None


__Модель с нграммами и tfidf__

In [47]:
tfidf = gensim.models.TfidfModel(corpus_2)
corpus_4 = [tfidf[text] for text in corpus_2]

In [48]:
lda_4 = gensim.models.LdaMulticore(corpus_4, 
                                 100, # число тем
                                 alpha='asymmetric',
                                 id2word=dictinary_2, 
                                 passes=2) 

In [49]:
metrics_4 = metrics(corpus_4, texts_norm, dictinary_2, lda_4)
print(metrics_4)

Perplexity: 271051.81226070283
Coherence: nan
None


__Лучшей оказалась самая первая модель, как в семинаре. Почему-то для нграмных моделей Coherence выдает nan. К сожалению, с этом я разобраться не смогла.__

In [53]:
lda_1.print_topics()

[(99,
  '0.020*"оркестр" + 0.014*"десант" + 0.013*"музыка" + 0.011*"инструмент" + 0.008*"высадка" + 0.007*"играть" + 0.006*"зал" + 0.005*"морской" + 0.005*"здание" + 0.005*"рог"'),
 (98,
  '0.057*"река" + 0.053*"км" + 0.029*"берег" + 0.026*"сельский" + 0.024*"дорога" + 0.021*"совет" + 0.021*"озеро" + 0.019*"харьковский" + 0.018*"расстояние" + 0.017*"станция"'),
 (97,
  '0.030*"канада" + 0.012*"ибн" + 0.009*"герб" + 0.008*"’" + 0.008*"список" + 0.007*"польский" + 0.006*"качество" + 0.006*"польша" + 0.006*"национальный" + 0.006*"уровень"'),
 (96,
  '0.077*"деревня" + 0.029*"сельсовет" + 0.013*"коммуна" + 0.010*"п" + 0.009*"литва" + 0.009*"двор" + 0.008*"житель" + 0.007*"данные" + 0.007*"дом" + 0.007*"губерния"'),
 (94,
  '0.017*"серия" + 0.016*"брат" + 0.013*"озеро" + 0.012*"грязь" + 0.011*"книга" + 0.011*"джейн" + 0.009*"друг" + 0.009*"произведение" + 0.008*"знание" + 0.007*"джо"'),
 (95,
  '0.019*"команда" + 0.017*"сезон" + 0.013*"f" + 0.012*"ferrari" + 0.010*"болид" + 0.009*"брянский"

__Самая красивая тема:__ (96,
  '0.077*"деревня" + 0.029*"сельсовет" + 0.013*"коммуна" + 0.010*"п" + 0.009*"литва" + 0.009*"двор" + 0.008*"житель" + 0.007*"данные" + 0.007*"дом" + 0.007*"губерния"')

In [54]:
lda_2.print_topics()

[(99,
  '0.023*"иордания" + 0.016*"израиль" + 0.008*"епархия" + 0.006*"организация" + 0.006*"животное" + 0.006*"якутский" + 0.006*"израильский" + 0.006*"сентябрь" + 0.005*"лесной" + 0.005*"сторона"'),
 (98,
  '0.025*"звезда" + 0.015*"кладбище" + 0.008*"система" + 0.007*"эстония" + 0.007*"иран" + 0.006*"солнце" + 0.006*"мемориал" + 0.006*"колхоз" + 0.005*"b" + 0.005*"участок"'),
 (97,
  '0.011*"мочь" + 0.009*"парк" + 0.007*"авиакомпания" + 0.007*"каждый" + 0.007*"цилиндр" + 0.006*"система" + 0.006*"проект" + 0.005*"microsoft" + 0.005*"изображение" + 0.004*"версия"'),
 (95,
  '0.013*"растение" + 0.010*"земля" + 0.009*"напряжение" + 0.009*"одесса" + 0.008*"вода" + 0.008*"одесский" + 0.008*"антенна" + 0.007*"писатель" + 0.005*"обычно" + 0.005*"воздух"'),
 (96,
  '0.051*"клуб" + 0.030*"команда" + 0.028*"матч" + 0.012*"гол" + 0.012*"перейти" + 0.011*"чемпионат" + 0.011*"игрок" + 0.011*"провести" + 0.010*"сезон" + 0.010*"игра"'),
 (94,
  '0.021*"подводный_лодка" + 0.010*"село" + 0.010*"флот" 

__Самая красивая тема:__ (9,
  '0.055*"игра" + 0.012*"игрок" + 0.009*"уровень" + 0.009*"император" + 0.007*"враг" + 0.007*"мочь" + 0.005*"компьютерный_игра" + 0.005*"игровой" + 0.005*"персонаж" + 0.005*"можно"')

In [55]:
lda_3.print_topics()

[(98,
  '0.000*"просвещение" + 0.000*"издательство" + 0.000*"компания" + 0.000*"учебник" + 0.000*"матч" + 0.000*"забить" + 0.000*"цска" + 0.000*"гол" + 0.000*"президент" + 0.000*"co"'),
 (99,
  '0.000*"портрет" + 0.000*"икона" + 0.000*"комната" + 0.000*"санкт-петербург" + 0.000*"александр" + 0.000*"авторский" + 0.000*"мастер" + 0.000*"александрович" + 0.000*"казанский" + 0.000*"техника"'),
 (97,
  '0.005*"подбор" + 0.001*"тим" + 0.000*"подразумеваться" + 0.000*"ричард" + 0.000*"ирландия" + 0.000*"сезон" + 0.000*"генрих" + 0.000*"севастополь" + 0.000*"де" + 0.000*"очко"'),
 (96,
  '0.000*"билет" + 0.000*"посёлок" + 0.000*"совхоз" + 0.000*"портрет" + 0.000*"житель" + 0.000*"замок" + 0.000*"печать" + 0.000*"полевой" + 0.000*"усадьба" + 0.000*"км"'),
 (95,
  '0.001*"патрик" + 0.000*"животное" + 0.000*"смотреть" + 0.000*"ян" + 0.000*"рига" + 0.000*"святой" + 0.000*"король" + 0.000*"как-то" + 0.000*"эссе" + 0.000*"построить"'),
 (94,
  '0.004*"крепиться" + 0.003*"единичный" + 0.001*"белок" +

__Самая красивая тема:__ (1,
  '0.082*"житомирский" + 0.034*"р-н" + 0.034*"телефонный" + 0.034*"почтовый" + 0.033*"индекс" + 0.032*"км²" + 0.032*"коатуа" + 0.029*"ул" + 0.026*"радомышльский" + 0.021*"романовский"'),

In [56]:
lda_4.print_topics()

[(99,
  '0.004*"стамбул" + 0.004*"альберт" + 0.004*"герб" + 0.003*"украшение" + 0.002*"провинциальный" + 0.002*"н_э" + 0.002*"символизировать" + 0.002*"драгоценный_камень" + 0.002*"пересечь" + 0.002*"изогнутый"'),
 (98,
  '0.006*"волга" + 0.006*"марк" + 0.004*"стр" + 0.003*"станица" + 0.002*"соревнование" + 0.002*"казак" + 0.002*"деревня" + 0.002*"поляк" + 0.002*"команда" + 0.002*"выпуск"'),
 (96,
  '0.002*"жалоба" + 0.001*"чемпионат_мир" + 0.001*"среди_юниор" + 0.001*"судья" + 0.001*"монгольский" + 0.001*"суд" + 0.001*"рекорд" + 0.001*"быть_назначить" + 0.001*"завоевать" + 0.001*"должность"'),
 (97,
  '0.019*"белокалитвинский_район" + 0.008*"алла_пугачёв" + 0.003*"прогноз" + 0.003*"песня" + 0.002*"карта" + 0.002*"дом" + 0.002*"писатель" + 0.001*"округ" + 0.001*"станица" + 0.001*"шторм"'),
 (95,
  '0.000*"расстояние_между" + 0.000*"сексуальный" + 0.000*"психический" + 0.000*"психологический" + 0.000*"раздел" + 0.000*"разделиться" + 0.000*"разрабатываться" + 0.000*"прозвучать" + 0.000*"

__Самая красивая тема:__ (93,
  '0.002*"полка" + 0.002*"противник" + 0.002*"самолёт" + 0.002*"батальон" + 0.001*"старший_лейтенант" + 0.001*"лётчик" + 0.001*"истребитель" + 0.001*"штаб" + 0.001*"командир" + 0.001*"бой"')