In [2]:
import artm

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid", {'axes.grid' : False})

In [None]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [49]:
batch_vectorizer = artm.BatchVectorizer(data_path='.',
                                        data_format='bow_uci',
                                        collection_name='rbc',
                                        target_folder='rbc_batches')

In [128]:
np.random.seed(1)
dictionary = artm.Dictionary('dict')
my_dictionary.gather(data_path='rbc_batches',
                  vocab_file_path='vocab.rbc.txt')
model.initialize(dictionary=my_dictionary)

In [130]:
my_dictionary.save(dictionary_path='rbc_batches/my_dictionary')

In [131]:
my_dictionary.load(dictionary_path='rbc_batches/my_dictionary.dict')

In [133]:
batch_vectorizer.dictionary

artm.Dictionary(name=62d732d9-dc07-4abb-91da-2ef12abd738c, num_entries=689)

In [134]:
T = 26 # number of topics
topic_names=["Topic_"+(str(i)) for i in range(T-1)]+["bcg"]

model = artm.ARTM(num_topics=T, topic_names=topic_names, num_processors=2,
                  reuse_theta=True, cache_theta=True, dictionary=my_dictionary)

In [135]:
model.scores.add(artm.TopTokensScore(name='top_tokens_score_mod1', num_tokens=15))

In [136]:
model.regularizers.add(artm.SmoothSparsePhiRegularizer(tau=1e5, class_ids='text', dictionary='dict', topic_names='bcg'))

In [137]:
model.num_document_passes = 1
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=26)

In [138]:
topic_names_cleared = list(topic_names).remove('bcg')
model.regularizers.add(artm.SmoothSparsePhiRegularizer(tau=-1e5, class_ids='text', dictionary='dict',
                                                       topic_names=topic_names_cleared))

In [139]:
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)

In [141]:
tokens = model.score_tracker['top_tokens_score_mod1'].last_tokens
for topic_name in model.topic_names:
    print (topic_name + ': ')
    for word in tokens[topic_name]:    
        print(word, end = ", ")
    print("\n")

Topic_0: 
турист, банкротство, фонд, выплата, деятельность, проблема, ростуризм, предлагать, получать, случай, туризм, человек, туроператор, направление, который, 

Topic_1: 
газпром, объем, европейский, поставка, европа, декабрь, показатель, заявка, рамка, контрактный, млрд, сообщать, прошлое, gazp, ниже, 

Topic_2: 
крупный, ведомство, цена, млрд, участник, соглашение, шесть, сумма, срок, свобода, размер, проводить, поддерживать, мочь, конкуренция, 

Topic_3: 
система, который, туризм, задача, страна, путевка, работать, бизнес, туроператор, обязательство, деньги, гостиница, направление, самый, греция, 

Topic_4: 
догузов, ростуризм, туроператор, глава, турист, должный, путевка, фонд, заявлять, туристический, случай, выплата, пандемия, турпродукт, клиент, 

Topic_5: 
который, россия, свой, самый, страна, направление, говорить, бизнес, давать, электронный, поэтому, нужно, многие, президент, высокий, 

Topic_6: 
который, человек, россия, инвестиция, мочь, новый, самый, время, директор, 

In [142]:
T = 12 # number of topics
topic_names=["Topic_"+(str(i)) for i in range(T-1)]+["bcg"]

lda = artm.LDA(num_topics=15, alpha=0.01, beta=0.001,
               num_document_passes=5, dictionary=my_dictionary,
               cache_theta=True)

In [143]:
lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

In [144]:
lda.sparsity_phi_last_value
lda.sparsity_theta_last_value

0.0

0.0

In [145]:
lda.perplexity_value

[681.2626953125,
 416.35003662109375,
 321.06982421875,
 241.34381103515625,
 214.24884033203125,
 203.34326171875,
 196.08868408203125,
 191.6376190185547,
 189.9319610595703,
 189.30413818359375]

In [146]:
top_tokens = lda.get_top_tokens(num_tokens=10)
for i, token_list in enumerate(top_tokens):
    print('Topic #{0}: {1}'.format(i, token_list),"\n")

Topic #0: ['турист', 'фонд', 'туроператор', 'банкротство', 'случай', 'получать', 'ростуризм', 'предлагать', 'выплата', 'система'] 

Topic #1: ['газпром', 'поставка', 'европа', 'объем', 'цена', 'европейский', 'заявка', 'сообщать', 'свой', 'представитель'] 

Topic #2: ['крупный', 'ведомство', 'млрд', 'цена', 'участник', 'соглашение', 'сумма', 'мочь', 'срок', 'проводить'] 

Topic #3: ['который', 'турист', 'туризм', 'система', 'путевка', 'задача', 'каждый', 'туроператор', 'получать', 'гостиница'] 

Topic #4: ['туроператор', 'ростуризм', 'компенсация', 'догузов', 'случай', 'фонд', 'ответственность', 'глава', 'турист', 'турфирма'] 

Topic #5: ['турист', 'который', 'туроператор', 'туризм', 'россия', 'самый', 'направление', 'ростуризм', 'давать', 'программа'] 

Topic #6: ['который', 'человек', 'россия', 'инвестиция', 'гендиректор', 'самый', 'новый', 'андрей', 'технология', 'сергей'] 

Topic #7: ['продукция', 'закон', 'который', 'импортер', 'рынок', 'новый', 'бизнес', 'россия', 'правительство',

In [120]:
phi = lda.phi_
theta = lda.get_theta()

In [121]:
phi

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
gazp,0.000002,0.007690,0.00001,0.000002,0.000004,0.000005,0.000002,0.000002,0.010203,0.000007,0.000008,0.000002,0.000002,0.000005,0.000004
group,0.000004,0.000007,0.00001,0.000423,0.000004,0.001334,0.003627,0.006529,0.000011,0.000007,0.001072,0.000002,0.000434,0.000005,0.000619
mouzenidis,0.006047,0.000007,0.00001,0.003494,0.000694,0.004839,0.000002,0.000002,0.000011,0.000007,0.008211,0.000002,0.003915,0.000005,0.001960
travel,0.005257,0.000007,0.00001,0.008163,0.001677,0.003261,0.000002,0.000002,0.000011,0.000007,0.001686,0.000002,0.001667,0.000005,0.003272
агентство,0.002930,0.007212,0.00001,0.001641,0.000460,0.000903,0.000002,0.000002,0.000011,0.000007,0.002407,0.000002,0.002125,0.000005,0.001019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
эффективный,0.000004,0.000007,0.00001,0.002355,0.000004,0.002868,0.001814,0.000002,0.000011,0.000007,0.001126,0.000002,0.001676,0.000005,0.001168
южный,0.003076,0.000007,0.00001,0.000305,0.000320,0.000678,0.001815,0.000002,0.000011,0.000007,0.000488,0.000002,0.000424,0.000005,0.000186
юрлица,0.006515,0.000007,0.00001,0.000117,0.000159,0.000047,0.000002,0.000002,0.000011,0.000007,0.000147,0.000002,0.000212,0.000005,0.000040
являться,0.000004,0.000007,0.00001,0.002471,0.000005,0.004021,0.005440,0.000002,0.000011,0.000007,0.000659,0.002423,0.000765,0.000005,0.001766


In [122]:
theta

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
topic_0,2.4e-05,8.4e-05,3e-05,7.8e-05,4.6e-05,5.5e-05,0.000113,0.575602,0.00011,0.066837
topic_1,2.8e-05,0.998706,3.6e-05,9.3e-05,2.9e-05,7e-05,0.00012,2.3e-05,0.07798,1.1e-05
topic_2,2.8e-05,9.8e-05,4.5e-05,8.4e-05,2.8e-05,6e-05,0.998398,3.2e-05,0.000122,1.8e-05
topic_3,3.2e-05,8.6e-05,3.2e-05,8.3e-05,4.8e-05,5.8e-05,0.00011,4e-05,0.000113,0.252386
topic_4,2.3e-05,8.1e-05,3e-05,7.8e-05,0.021103,5.5e-05,0.000109,0.423963,0.000107,0.022158
topic_5,3.9e-05,9.1e-05,3.3e-05,8.3e-05,4.5e-05,6e-05,0.000111,3.7e-05,0.000114,0.116861
topic_6,0.999581,9.2e-05,3.5e-05,9.1e-05,3.5e-05,6.6e-05,0.000111,2.6e-05,0.000113,3.6e-05
topic_7,3.9e-05,9.1e-05,0.999509,8.9e-05,3.7e-05,7.6e-05,0.000134,2.9e-05,0.000119,1.5e-05
topic_8,2.5e-05,0.00013,3.8e-05,9.9e-05,2.7e-05,7.1e-05,0.000121,2.6e-05,0.92051,1.2e-05
topic_9,3.6e-05,0.0001,3.9e-05,0.998804,3.7e-05,6.8e-05,0.000117,2.6e-05,0.000133,2.2e-05


In [127]:
# add test batch for the model