In [1]:
from artm_experiments import Experiment, Pool, GreedyTopicsFilter, ConvexHullTopicsFilter
from artm import *
import glob
%pylab inline
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib




In [2]:
# Shared input for every model fit below: data read from the 'kos' directory
# in the 'batches' format.
batch_vectorizer = BatchVectorizer(
    data_path='kos',
    data_format='batches',
)

def create_model(num_topics=50, sparse_theta_tau=-0.2, decorrelator_phi_tau=2.5e+5,
                 dictionary_path='kos/dictionary.dict', seed=None):
    """Build an ARTM model, load its dictionary and initialize it.

    All parameters are optional; calling ``create_model()`` with no arguments
    uses the values that were previously hard-coded in this function.

    Parameters
    ----------
    num_topics : int
        Number of topics passed to ``ARTM``.
    sparse_theta_tau : float
        ``tau`` of the ``SmoothSparseThetaRegularizer`` named 'SparseTheta'.
    decorrelator_phi_tau : float
        ``tau`` of the ``DecorrelatorPhiRegularizer`` named 'DecorrelatorPhi'.
    dictionary_path : str
        Path of the dictionary file loaded under the name 'dictionary'.
    seed : int or None
        Seed passed to ``ARTM.initialize``. When None, a fresh seed is drawn
        from ``np.random.randint(1000000)`` so that successive calls yield
        differently initialized models; pass an int for a reproducible model.

    Returns
    -------
    The initialized (not yet fitted) ``ARTM`` instance.
    """
    model_artm = ARTM(num_topics=num_topics,
                      scores=[PerplexityScore(name='PerplexityScore',
                                              use_unigram_document_model=False,
                                              dictionary_name='dictionary')],
                      regularizers=[SmoothSparseThetaRegularizer(name='SparseTheta',
                                                                 tau=sparse_theta_tau),
                                    DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                                               tau=decorrelator_phi_tau)])

    # The dictionary file must already exist on disk. Disabled code kept for
    # reference -- it gathers and saves a dictionary from the 'kos' data:
    #model_artm.gather_dictionary(dictionary_target_name='dictionary',
    #                             data_path='kos', vocab_file_path='vocab.kos.txt')
    #model_artm.save_dictionary(dictionary_name='dictionary', dictionary_path='kos/dictionary.dict')
    model_artm.load_dictionary(dictionary_name='dictionary', dictionary_path=dictionary_path)

    if seed is None:
        # No seed requested: keep the original behaviour of a random
        # initialization on every call.
        seed = np.random.randint(1000000)
    model_artm.initialize(dictionary_name='dictionary', seed=seed)

    return model_artm

In [4]:
def display_points(phi):
    """Plot a 2-D projection of the columns of ``phi`` and label each point.

    ``phi`` is expected to be a pandas DataFrame (it is accessed through
    ``.values`` and ``.columns``). Its transposed value matrix is handed to
    ``ConvexHullTopicsFilter.project_points(..., dim=2)``; row ``i`` of the
    result is drawn as a marker and annotated with ``phi.columns[i]``.

    Returns None; the figure is shown via ``plt.show()``.
    """
    # `.values` is the supported equivalent of the deprecated
    # `DataFrame.as_matrix()`.
    points = ConvexHullTopicsFilter.project_points(phi.values.T, dim=2)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(points[:, 0], points[:, 1], marker='o', ls='')
    for i, column_label in enumerate(phi.columns):
        # With textcoords='offset points' an explicit xytext is required;
        # otherwise the offset defaults to the data coordinates themselves.
        ax.annotate(column_label, xy=points[i], xytext=(4, 4),
                    textcoords='offset points')

    plt.show()

In [5]:
# Fit several independently created models and feed each model's phi/theta
# matrices into a single experiment whose pool uses ConvexHullTopicsFilter.
exp = Experiment(Pool(topics_filter=ConvexHullTopicsFilter(eps=1e-3, iter_num=10, verbose=False),
                      save_topics=True))
for i in range(10):
    model_artm = create_model()
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15, num_document_passes=1)
    # Uncomment to inspect the 2-D projection of this model's topics:
    #display_points(model_artm.get_phi())
    exp.collect_topics(model_artm.get_phi(), model_artm.get_theta())
    # Basic-topic count reported by the pool after this run. The
    # single-argument print(...) form prints the same on Python 2 and 3.
    print(exp.topics_pool.get_basic_topics_count())

35
67
82
92
93
88
86
92
100
115


In [6]:
# List the experiment's basic topics (the recorded output shows a topic name
# followed by five words per line).
exp.show_basic_topics()

topic8 | [u'recommendations' u'hurts' u'buying' u'compromise' u'lisa']
topic15 | [u'sharon' u'smith' u'wink' u'schwarz' u'lynch']
topic21 | [u'fighters' u'fantasy' u'mosque' u'murder' u'intel']
topic24 | [u'aflcio' u'loyalty' u'industrial' u'matthews' u'petraeus']
topic28 | [u'cuba' u'divorce' u'dishonest' u'patients' u'fundamentally']
topic29 | [u'bald' u'kids' u'slight' u'abstain' u'peanut']
topic36 | [u'courage' u'postdebate' u'james' u'carlson' u'chamber']
topic44 | [u'wilson' u'soft' u'indicted' u'ruled' u'scientific']
topic52 | [u'chaos' u'execution' u'blew' u'berg' u'beheading']
topic53 | [u'inez' u'chandlers' u'robinson' u'leans' u'retention']
topic61 | [u'toomey' u'hoeffel' u'indiana' u'shares' u'specters']
topic62 | [u'meetup' u'schwarzenegger' u'drudge' u'fax' u'brock']
topic65 | [u'brain' u'ann' u'billmon' u'exposure' u'blanco']
topic67 | [u'fingerhut' u'brush' u'award' u'univ' u'ronk']
topic76 | [u'obama' u'geneva' u'thomas' u'ilsen' u'beauprez']
topic78 | [u'clarke' u'tig

In [7]:
# List all topics ordered by their closest topic; in the recorded output the
# last column names that closest topic -- presumably a basic topic, TODO confirm.
exp.show_all_topics(sort_by_closest_topic=True)

topic100 | [u'mccains' u'airplane' u'cargo' u'conclusions' u'mouse'] | topic100
topic117 | [u'physical' u'eye' u'healthcare' u'robertson' u'personally'] | topic117
topic118 | [u'bloomfield' u'save' u'stonewalling' u'suffolk' u'ronk'] | topic118
topic134 | [u'leaked' u'resonate' u'announcement' u'captain' u'partial'] | topic134
topic148 | [u'coburns' u'rowland' u'bowerss' u'plea' u'playing'] | topic148
topic15 | [u'sharon' u'smith' u'wink' u'schwarz' u'lynch'] | topic15
topic150 | [u'thurlow' u'walmart' u'bronze' u'boxer' u'burt'] | topic150
topic1 | [u'ryan' u'clintons' u'nag' u'endorsements' u'toomey'] | topic151
topic125 | [u'hoeffel' u'measure' u'toomey' u'income' u'households'] | topic151
topic151 | [u'toomey' u'filing' u'hoeffel' u'rick' u'pat'] | topic151
topic287 | [u'pdf' u'toomey' u'missouri' u'surveyusa' u'hoeffel'] | topic151
topic159 | [u'christians' u'jesus' u'blessed' u'guidelines' u'benson'] | topic159
topic168 | [u'qualify' u'goss' u'collecting' u'signatures' u'cosen'] 

In [14]:
# Distance between two pooled topics as reported by the pool.
# NOTE(review): the metric is defined in artm_experiments, not visible here.
exp.topics_pool.get_dist_between_topics('topic10', 'topic60')

0.002252705628052354

In [19]:
# Show a batch of 10 topics.
# NOTE(review): "next" suggests an internal cursor, so re-running this cell may
# show a different batch -- confirm against artm_experiments.
exp.show_next_topics_batch(10)

topic9:
[u'clarke' u'schneider' u'indicted' u'indictment' u'rosenberg']
topic71:
[u'ratings' u'caucuses' u'actual' u'film' u'winner']
topic87:
[u'violence' u'veterans' u'mission' u'boat' u'command']
topic117:
[u'nuclear' u'quickly' u'finish' u'precinct' u'dole']
topic139:
[u'capture' u'ralph' u'independents' u'inevitable' u'extremely']
topic151:
[u'town' u'spanish' u'abuses' u'photos' u'hearings']
topic153:
[u'salazar' u'favorite' u'car' u'initial' u'ballots']
topic154:
[u'frost' u'signs' u'armor' u'scalia' u'sirota']
topic156:
[u'incumbents' u'liberals' u'gains' u'tens' u'collected']
topic157:
[u'nancy' u'east' u'religious' u'peace' u'qaeda']


In [34]:
# Export the dataset to the "navigator".
# NOTE(review): destination and format are defined in artm_experiments -- confirm.
exp.save_dataset_to_navigator()

In [35]:
# Export the next batch of 15 topics to the navigator. The original line had a
# stray space inside the method name, which is a syntax error.
exp.save_next_topics_batch_to_navigator(15)

In [None]:
# Load assessments back from the navigator (this cell was not executed in the
# saved notebook).
exp.load_assessments_from_navigator()

In [None]:
# Display the assessments held by the experiment; presumably requires the
# load_assessments_from_navigator() call to have run first -- TODO confirm.
exp.show_assessments()