In [1]:
# In this workshop we perform topic modeling using gensim

# We are using the subnews dataset that we used last week. 
# The "Class" labels here are only used for sanity check of the topics discovered later.
# Remember, in actual use of topic modelling, the documents DON'T come with labeled classes.
# It's unsupervised learning.

import pandas as pd
news=pd.read_table('r8-train-all-terms.txt',header=None,names = ["Class", "Text"])
subnews=news[(news.Class=="trade")| (news.Class=='crude')|(news.Class=='money-fx') ]
subnews.head()

Unnamed: 0,Class,Text
15,trade,brazil anti inflation plan limps to anniversar...
43,crude,diamond shamrock dia cuts crude prices diamond...
55,crude,opec may have to meet to firm prices analysts ...
76,crude,texaco canada cuts crude prices canadian cts b...
77,crude,texaco canada txc lowers crude postings texaco...


In [2]:
# Let's use the similar preprocessing we used last week.
# The output of each document is a list of tokens.

import nltk
from nltk.corpus import stopwords
mystopwords=stopwords.words("English") + ['one', 'become', 'get', 'make', 'take']
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens=[ WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens=[ t for t in tokens if t not in mystopwords]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    return(tokens)

In [3]:
#split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(subnews.Text, subnews.Class, test_size=0.33, random_state=12)


In [4]:
# Apply preprocessing to every document in the training set.
toks_train = X_train.apply(pre_process)


In [5]:
#!pip install gensim

In [6]:
import logging
import gensim 
from gensim import corpora

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [7]:
# Prepare a vocabulary dictionary.
dictionary = corpora.Dictionary(toks_train)
print(dictionary)

2018-09-15 08:34:20,736 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-09-15 08:34:20,844 : INFO : built Dictionary(5971 unique tokens: ['aide', 'amendment', 'annual', 'approved', 'approves']...) from 475 documents (total 57987 corpus positions)


Dictionary(5971 unique tokens: ['aide', 'amendment', 'annual', 'approved', 'approves']...)


In [8]:
# Filter off any words with document frequency less than 3, or appearing in more than 80% documents
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)

2018-09-15 08:34:20,859 : INFO : discarding 3537 tokens: [('frightening', 2), ('mandating', 2), ('reuter', 408), ('said', 403), ('toughening', 2), ('wedge', 2), ('aggravated', 1), ('delay', 1), ('devaluation', 2), ('herald', 1)]...
2018-09-15 08:34:20,862 : INFO : keeping 2434 tokens which were in no less than 3 and no more than 332 (=70.0%) documents
2018-09-15 08:34:20,866 : INFO : resulting dictionary: Dictionary(2434 unique tokens: ['aide', 'amendment', 'annual', 'approved', 'approves']...)


Dictionary(2434 unique tokens: ['aide', 'amendment', 'annual', 'approved', 'approves']...)


In [9]:
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [10]:
# Use the dictionary to prepare a DTM (using TF)
dtm_train = [dictionary.doc2bow(d) for d in toks_train ]

In [11]:
# Build an LDA model for 3 topics out of the DTM
%time lda = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 3,  alpha='auto',chunksize=30,id2word = dictionary, passes = 20,random_state=432)
#chunksize (int, optional) – Number of documents to be used in each training chunk.
#passes (int, optional) – Number of passes through the corpus during training.

2018-09-15 08:34:21,054 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2018-09-15 08:34:21,056 : INFO : using symmetric eta at 0.3333333333333333
2018-09-15 08:34:21,058 : INFO : using serial LDA version on this node
2018-09-15 08:34:21,062 : INFO : running online (multi-pass) LDA training, 3 topics, 20 passes over the supplied corpus of 475 documents, updating model once every 30 documents, evaluating perplexity every 300 documents, iterating 50x with a convergence threshold of 0.001000
2018-09-15 08:34:21,062 : INFO : PROGRESS: pass 0, at document #30/475
2018-09-15 08:34:21,111 : INFO : optimized alpha [0.5270892, 0.50643796, 0.48903793]
2018-09-15 08:34:21,114 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:21,121 : INFO : topic #0 (0.527): 0.027*"trade" + 0.013*"china" + 0.013*"oil" + 0.012*"year" + 0.011*"export" + 0.011*"would" + 0.009*"last" + 0.009*"gatt" + 0.008*"system" + 0.008*"currency"
2018-09-1

2018-09-15 08:34:21,586 : INFO : optimized alpha [0.5715378, 0.47655696, 0.52661896]
2018-09-15 08:34:21,587 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:21,590 : INFO : topic #0 (0.572): 0.028*"trade" + 0.017*"japan" + 0.015*"would" + 0.009*"year" + 0.008*"world" + 0.008*"agreement" + 0.007*"minister" + 0.007*"import" + 0.007*"foreign" + 0.007*"market"
2018-09-15 08:34:21,591 : INFO : topic #1 (0.477): 0.046*"oil" + 0.029*"mln" + 0.018*"price" + 0.014*"barrel" + 0.014*"opec" + 0.014*"pct" + 0.013*"market" + 0.012*"production" + 0.010*"stg" + 0.010*"bank"
2018-09-15 08:34:21,592 : INFO : topic #2 (0.527): 0.020*"billion" + 0.015*"dlrs" + 0.015*"dollar" + 0.015*"bank" + 0.014*"japan" + 0.014*"official" + 0.014*"japanese" + 0.011*"market" + 0.009*"trade" + 0.009*"yen"
2018-09-15 08:34:21,593 : INFO : topic diff=0.477447, rho=0.333333
2018-09-15 08:34:21,639 : INFO : -7.282 per-word bound, 155.7 perplexity estimate based on a held-out corpus of

2018-09-15 08:34:22,027 : INFO : topic #1 (0.398): 0.043*"oil" + 0.025*"mln" + 0.022*"price" + 0.018*"opec" + 0.017*"bpd" + 0.016*"crude" + 0.012*"dlrs" + 0.012*"barrel" + 0.011*"pct" + 0.009*"market"
2018-09-15 08:34:22,028 : INFO : topic #2 (0.366): 0.028*"billion" + 0.018*"dlrs" + 0.018*"bank" + 0.015*"official" + 0.013*"japanese" + 0.011*"dollar" + 0.010*"market" + 0.010*"february" + 0.009*"year" + 0.008*"last"
2018-09-15 08:34:22,028 : INFO : topic diff=0.275793, rho=0.236801
2018-09-15 08:34:22,029 : INFO : PROGRESS: pass 1, at document #60/475
2018-09-15 08:34:22,057 : INFO : optimized alpha [0.42603716, 0.3405189, 0.33244377]
2018-09-15 08:34:22,057 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:22,060 : INFO : topic #0 (0.426): 0.033*"trade" + 0.013*"would" + 0.012*"country" + 0.011*"japan" + 0.008*"year" + 0.007*"state" + 0.007*"foreign" + 0.007*"agreement" + 0.006*"economic" + 0.006*"market"
2018-09-15 08:34:22,061 : INFO : topic #1

2018-09-15 08:34:22,400 : INFO : PROGRESS: pass 1, at document #300/475
2018-09-15 08:34:22,422 : INFO : optimized alpha [0.33410797, 0.27763724, 0.29562563]
2018-09-15 08:34:22,423 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:22,426 : INFO : topic #0 (0.334): 0.031*"trade" + 0.016*"japan" + 0.016*"would" + 0.009*"world" + 0.008*"year" + 0.008*"agreement" + 0.008*"country" + 0.007*"minister" + 0.007*"market" + 0.006*"foreign"
2018-09-15 08:34:22,427 : INFO : topic #1 (0.278): 0.048*"oil" + 0.026*"mln" + 0.020*"price" + 0.014*"opec" + 0.014*"dlrs" + 0.013*"barrel" + 0.013*"pct" + 0.011*"crude" + 0.011*"market" + 0.010*"stg"
2018-09-15 08:34:22,428 : INFO : topic #2 (0.296): 0.024*"billion" + 0.020*"bank" + 0.017*"dollar" + 0.016*"dlrs" + 0.014*"japanese" + 0.013*"official" + 0.012*"market" + 0.011*"japan" + 0.010*"trade" + 0.009*"chip"
2018-09-15 08:34:22,429 : INFO : topic diff=0.243499, rho=0.236801
2018-09-15 08:34:22,429 : INFO : PROGRESS

2018-09-15 08:34:22,790 : INFO : topic #1 (0.230): 0.043*"oil" + 0.025*"mln" + 0.022*"price" + 0.019*"opec" + 0.017*"bpd" + 0.016*"crude" + 0.014*"dlrs" + 0.013*"barrel" + 0.009*"ecuador" + 0.009*"saudi"
2018-09-15 08:34:22,791 : INFO : topic #2 (0.226): 0.030*"billion" + 0.024*"bank" + 0.019*"dollar" + 0.016*"dlrs" + 0.014*"official" + 0.012*"japanese" + 0.010*"february" + 0.010*"market" + 0.010*"january" + 0.009*"trade"
2018-09-15 08:34:22,792 : INFO : topic diff=0.258099, rho=0.230429
2018-09-15 08:34:22,793 : INFO : PROGRESS: pass 2, at document #90/475
2018-09-15 08:34:22,820 : INFO : optimized alpha [0.2803443, 0.2376004, 0.21250975]
2018-09-15 08:34:22,821 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:22,824 : INFO : topic #0 (0.280): 0.033*"trade" + 0.014*"would" + 0.011*"country" + 0.011*"japan" + 0.009*"year" + 0.007*"state" + 0.007*"agreement" + 0.007*"foreign" + 0.006*"minister" + 0.006*"new"
2018-09-15 08:34:22,825 : INFO : topic

2018-09-15 08:34:23,138 : INFO : topic diff=0.225510, rho=0.230429
2018-09-15 08:34:23,140 : INFO : PROGRESS: pass 2, at document #330/475
2018-09-15 08:34:23,161 : INFO : optimized alpha [0.2673573, 0.22322062, 0.22839516]
2018-09-15 08:34:23,163 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:23,169 : INFO : topic #0 (0.267): 0.032*"trade" + 0.017*"would" + 0.015*"japan" + 0.008*"world" + 0.008*"country" + 0.008*"year" + 0.007*"agreement" + 0.007*"import" + 0.006*"minister" + 0.006*"foreign"
2018-09-15 08:34:23,171 : INFO : topic #1 (0.223): 0.044*"oil" + 0.025*"mln" + 0.019*"price" + 0.017*"dlrs" + 0.016*"barrel" + 0.014*"crude" + 0.013*"opec" + 0.012*"stg" + 0.012*"pct" + 0.010*"ecuador"
2018-09-15 08:34:23,179 : INFO : topic #2 (0.228): 0.026*"billion" + 0.023*"bank" + 0.018*"dollar" + 0.016*"dlrs" + 0.012*"market" + 0.011*"japanese" + 0.011*"official" + 0.010*"pct" + 0.009*"currency" + 0.009*"trade"
2018-09-15 08:34:23,180 : INFO : topic 

2018-09-15 08:34:23,496 : INFO : topic #1 (0.202): 0.047*"oil" + 0.026*"mln" + 0.023*"price" + 0.019*"opec" + 0.017*"bpd" + 0.016*"dlrs" + 0.015*"crude" + 0.013*"barrel" + 0.011*"saudi" + 0.010*"company"
2018-09-15 08:34:23,498 : INFO : topic #2 (0.187): 0.034*"billion" + 0.027*"bank" + 0.018*"dollar" + 0.016*"dlrs" + 0.011*"official" + 0.011*"trade" + 0.011*"year" + 0.010*"market" + 0.010*"export" + 0.010*"mln"
2018-09-15 08:34:23,499 : INFO : topic diff=0.216634, rho=0.224544
2018-09-15 08:34:23,502 : INFO : PROGRESS: pass 3, at document #120/475
2018-09-15 08:34:23,524 : INFO : optimized alpha [0.24203055, 0.20026536, 0.19727542]
2018-09-15 08:34:23,525 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:23,527 : INFO : topic #0 (0.242): 0.031*"trade" + 0.014*"would" + 0.012*"japan" + 0.011*"country" + 0.009*"year" + 0.007*"agreement" + 0.007*"minister" + 0.006*"foreign" + 0.006*"state" + 0.006*"meeting"
2018-09-15 08:34:23,528 : INFO : topic #1

2018-09-15 08:34:23,855 : INFO : topic diff=0.274380, rho=0.224544
2018-09-15 08:34:23,855 : INFO : PROGRESS: pass 3, at document #360/475
2018-09-15 08:34:23,881 : INFO : optimized alpha [0.2560795, 0.21357946, 0.20640458]
2018-09-15 08:34:23,881 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:23,886 : INFO : topic #0 (0.256): 0.034*"trade" + 0.017*"would" + 0.015*"japan" + 0.009*"country" + 0.008*"year" + 0.008*"world" + 0.007*"minister" + 0.007*"market" + 0.006*"state" + 0.006*"agreement"
2018-09-15 08:34:23,887 : INFO : topic #1 (0.214): 0.044*"oil" + 0.032*"mln" + 0.017*"price" + 0.017*"bpd" + 0.016*"dlrs" + 0.015*"crude" + 0.014*"barrel" + 0.012*"opec" + 0.011*"pct" + 0.010*"stg"
2018-09-15 08:34:23,888 : INFO : topic #2 (0.206): 0.026*"bank" + 0.024*"billion" + 0.017*"dollar" + 0.013*"dlrs" + 0.013*"market" + 0.013*"currency" + 0.011*"pct" + 0.009*"year" + 0.009*"foreign" + 0.009*"official"
2018-09-15 08:34:23,889 : INFO : topic diff=0.2

2018-09-15 08:34:24,203 : INFO : topic #1 (0.189): 0.048*"oil" + 0.024*"price" + 0.024*"mln" + 0.020*"opec" + 0.017*"dlrs" + 0.016*"bpd" + 0.016*"crude" + 0.013*"barrel" + 0.010*"saudi" + 0.009*"company"
2018-09-15 08:34:24,204 : INFO : topic #2 (0.192): 0.032*"bank" + 0.031*"billion" + 0.020*"dollar" + 0.016*"dlrs" + 0.012*"currency" + 0.011*"year" + 0.010*"market" + 0.010*"january" + 0.009*"trade" + 0.009*"mln"
2018-09-15 08:34:24,205 : INFO : topic diff=0.226200, rho=0.219089
2018-09-15 08:34:24,207 : INFO : PROGRESS: pass 4, at document #150/475
2018-09-15 08:34:24,227 : INFO : optimized alpha [0.2150606, 0.18915643, 0.18370195]
2018-09-15 08:34:24,228 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:24,231 : INFO : topic #0 (0.215): 0.031*"trade" + 0.014*"would" + 0.013*"japan" + 0.010*"country" + 0.008*"year" + 0.007*"agreement" + 0.007*"state" + 0.007*"minister" + 0.006*"united" + 0.006*"foreign"
2018-09-15 08:34:24,235 : INFO : topic #1 

2018-09-15 08:34:24,535 : INFO : topic diff=0.233257, rho=0.219089
2018-09-15 08:34:24,535 : INFO : PROGRESS: pass 4, at document #390/475
2018-09-15 08:34:24,559 : INFO : optimized alpha [0.25514513, 0.20413396, 0.20976998]
2018-09-15 08:34:24,560 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:24,563 : INFO : topic #0 (0.255): 0.032*"trade" + 0.017*"would" + 0.016*"japan" + 0.009*"country" + 0.008*"japanese" + 0.008*"year" + 0.008*"market" + 0.007*"state" + 0.007*"world" + 0.007*"agreement"
2018-09-15 08:34:24,563 : INFO : topic #1 (0.204): 0.044*"oil" + 0.030*"mln" + 0.021*"price" + 0.017*"opec" + 0.016*"crude" + 0.016*"bpd" + 0.014*"dlrs" + 0.013*"barrel" + 0.010*"pct" + 0.010*"production"
2018-09-15 08:34:24,565 : INFO : topic #2 (0.210): 0.025*"billion" + 0.025*"bank" + 0.015*"dlrs" + 0.014*"dollar" + 0.014*"currency" + 0.014*"market" + 0.013*"pct" + 0.010*"foreign" + 0.009*"year" + 0.009*"mln"
2018-09-15 08:34:24,567 : INFO : topic diff=

2018-09-15 08:34:24,881 : INFO : topic #1 (0.186): 0.048*"oil" + 0.024*"mln" + 0.024*"price" + 0.023*"opec" + 0.017*"dlrs" + 0.016*"bpd" + 0.015*"crude" + 0.014*"barrel" + 0.013*"saudi" + 0.008*"pct"
2018-09-15 08:34:24,882 : INFO : topic #2 (0.183): 0.035*"billion" + 0.028*"bank" + 0.017*"dlrs" + 0.016*"dollar" + 0.011*"year" + 0.011*"currency" + 0.010*"trade" + 0.010*"january" + 0.010*"market" + 0.010*"mln"
2018-09-15 08:34:24,883 : INFO : topic diff=0.199340, rho=0.214013
2018-09-15 08:34:24,884 : INFO : PROGRESS: pass 5, at document #180/475
2018-09-15 08:34:24,907 : INFO : optimized alpha [0.22376107, 0.18223973, 0.18968318]
2018-09-15 08:34:24,909 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:24,912 : INFO : topic #0 (0.224): 0.029*"trade" + 0.016*"japan" + 0.014*"would" + 0.009*"country" + 0.008*"year" + 0.007*"japanese" + 0.007*"agreement" + 0.007*"minister" + 0.006*"foreign" + 0.006*"market"
2018-09-15 08:34:24,913 : INFO : topic #1 

2018-09-15 08:34:25,201 : INFO : topic diff=0.223563, rho=0.214013
2018-09-15 08:34:25,202 : INFO : PROGRESS: pass 5, at document #420/475
2018-09-15 08:34:25,223 : INFO : optimized alpha [0.23510785, 0.19366887, 0.20420533]
2018-09-15 08:34:25,226 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:25,230 : INFO : topic #0 (0.235): 0.032*"trade" + 0.016*"japan" + 0.015*"would" + 0.010*"japanese" + 0.008*"year" + 0.008*"country" + 0.008*"market" + 0.007*"agreement" + 0.007*"minister" + 0.007*"state"
2018-09-15 08:34:25,234 : INFO : topic #1 (0.194): 0.047*"oil" + 0.027*"mln" + 0.021*"price" + 0.017*"crude" + 0.017*"opec" + 0.014*"dlrs" + 0.014*"bpd" + 0.014*"barrel" + 0.009*"production" + 0.009*"ecuador"
2018-09-15 08:34:25,235 : INFO : topic #2 (0.204): 0.024*"billion" + 0.023*"bank" + 0.016*"dlrs" + 0.015*"dollar" + 0.015*"market" + 0.014*"currency" + 0.013*"pct" + 0.011*"mln" + 0.010*"exchange" + 0.009*"year"
2018-09-15 08:34:25,235 : INFO : top

2018-09-15 08:34:25,545 : INFO : topic #0 (0.226): 0.029*"trade" + 0.016*"japan" + 0.014*"would" + 0.009*"country" + 0.008*"japanese" + 0.008*"year" + 0.007*"agreement" + 0.007*"minister" + 0.006*"official" + 0.006*"foreign"
2018-09-15 08:34:25,546 : INFO : topic #1 (0.183): 0.047*"oil" + 0.026*"mln" + 0.023*"price" + 0.022*"opec" + 0.016*"bpd" + 0.015*"barrel" + 0.015*"crude" + 0.014*"dlrs" + 0.010*"saudi" + 0.009*"pct"
2018-09-15 08:34:25,547 : INFO : topic #2 (0.191): 0.032*"billion" + 0.026*"bank" + 0.021*"dollar" + 0.016*"dlrs" + 0.014*"currency" + 0.013*"exchange" + 0.012*"market" + 0.010*"january" + 0.010*"rate" + 0.010*"year"
2018-09-15 08:34:25,548 : INFO : topic diff=0.284765, rho=0.209274
2018-09-15 08:34:25,549 : INFO : PROGRESS: pass 6, at document #210/475
2018-09-15 08:34:25,572 : INFO : optimized alpha [0.22978464, 0.17994785, 0.19752899]
2018-09-15 08:34:25,573 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:25,576 : INFO : top

2018-09-15 08:34:25,874 : INFO : topic #2 (0.207): 0.025*"bank" + 0.024*"billion" + 0.015*"dollar" + 0.015*"dlrs" + 0.015*"market" + 0.014*"currency" + 0.013*"pct" + 0.011*"mln" + 0.010*"exchange" + 0.010*"rate"
2018-09-15 08:34:25,875 : INFO : topic diff=0.164621, rho=0.209274
2018-09-15 08:34:25,875 : INFO : PROGRESS: pass 6, at document #450/475
2018-09-15 08:34:25,896 : INFO : optimized alpha [0.24427487, 0.1898646, 0.20707892]
2018-09-15 08:34:25,902 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:25,905 : INFO : topic #0 (0.244): 0.031*"trade" + 0.015*"would" + 0.014*"japan" + 0.009*"country" + 0.008*"year" + 0.008*"japanese" + 0.007*"agreement" + 0.007*"world" + 0.007*"state" + 0.007*"official"
2018-09-15 08:34:25,906 : INFO : topic #1 (0.190): 0.049*"oil" + 0.024*"price" + 0.023*"mln" + 0.017*"dlrs" + 0.016*"opec" + 0.016*"bpd" + 0.015*"crude" + 0.014*"barrel" + 0.012*"company" + 0.010*"pct"
2018-09-15 08:34:25,906 : INFO : topic #2 (0.

2018-09-15 08:34:26,209 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:26,214 : INFO : topic #0 (0.229): 0.029*"trade" + 0.016*"japan" + 0.015*"would" + 0.010*"country" + 0.009*"minister" + 0.008*"japanese" + 0.008*"agreement" + 0.007*"official" + 0.007*"year" + 0.007*"market"
2018-09-15 08:34:26,216 : INFO : topic #1 (0.180): 0.047*"oil" + 0.027*"mln" + 0.023*"price" + 0.020*"opec" + 0.014*"dlrs" + 0.014*"bpd" + 0.014*"crude" + 0.014*"barrel" + 0.009*"saudi" + 0.009*"production"
2018-09-15 08:34:26,218 : INFO : topic #2 (0.197): 0.028*"billion" + 0.024*"bank" + 0.020*"dollar" + 0.015*"currency" + 0.015*"market" + 0.013*"rate" + 0.013*"dlrs" + 0.013*"exchange" + 0.011*"pct" + 0.009*"january"
2018-09-15 08:34:26,219 : INFO : topic diff=0.220877, rho=0.204837
2018-09-15 08:34:26,223 : INFO : PROGRESS: pass 7, at document #240/475
2018-09-15 08:34:26,241 : INFO : optimized alpha [0.2438147, 0.18016411, 0.20712359]
2018-09-15 08:34:26,242 : INFO :

2018-09-15 08:34:26,556 : INFO : topic #2 (0.205): 0.028*"billion" + 0.025*"bank" + 0.015*"dlrs" + 0.014*"currency" + 0.014*"market" + 0.014*"dollar" + 0.012*"pct" + 0.011*"exchange" + 0.010*"rate" + 0.010*"foreign"
2018-09-15 08:34:26,557 : INFO : topic diff=0.260118, rho=0.204837
2018-09-15 08:34:26,586 : INFO : -6.898 per-word bound, 119.3 perplexity estimate based on a held-out corpus of 25 documents with 2811 words
2018-09-15 08:34:26,587 : INFO : PROGRESS: pass 7, at document #475/475
2018-09-15 08:34:26,603 : INFO : optimized alpha [0.22961736, 0.17759277, 0.19078346]
2018-09-15 08:34:26,604 : INFO : merging changes from 25 documents into a model of 475 documents
2018-09-15 08:34:26,607 : INFO : topic #0 (0.230): 0.033*"trade" + 0.015*"would" + 0.013*"japan" + 0.010*"country" + 0.009*"japanese" + 0.008*"year" + 0.007*"state" + 0.007*"agreement" + 0.007*"official" + 0.007*"world"
2018-09-15 08:34:26,608 : INFO : topic #1 (0.178): 0.049*"oil" + 0.024*"price" + 0.023*"mln" + 0.018*

2018-09-15 08:34:26,888 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:26,890 : INFO : topic #0 (0.239): 0.031*"trade" + 0.018*"japan" + 0.015*"would" + 0.009*"japanese" + 0.009*"country" + 0.008*"minister" + 0.008*"official" + 0.008*"agreement" + 0.007*"year" + 0.007*"import"
2018-09-15 08:34:26,891 : INFO : topic #1 (0.177): 0.047*"oil" + 0.028*"mln" + 0.024*"price" + 0.018*"opec" + 0.015*"dlrs" + 0.014*"crude" + 0.014*"barrel" + 0.012*"bpd" + 0.010*"saudi" + 0.009*"production"
2018-09-15 08:34:26,892 : INFO : topic #2 (0.202): 0.027*"billion" + 0.026*"bank" + 0.017*"dollar" + 0.014*"exchange" + 0.013*"market" + 0.013*"rate" + 0.013*"currency" + 0.013*"dlrs" + 0.011*"pct" + 0.009*"foreign"
2018-09-15 08:34:26,893 : INFO : topic diff=0.205579, rho=0.200670
2018-09-15 08:34:26,894 : INFO : PROGRESS: pass 8, at document #270/475
2018-09-15 08:34:26,920 : INFO : optimized alpha [0.23664843, 0.17972754, 0.19970568]
2018-09-15 08:34:26,920 : INFO 

2018-09-15 08:34:27,245 : INFO : topic #1 (0.176): 0.049*"oil" + 0.024*"price" + 0.023*"mln" + 0.018*"crude" + 0.017*"dlrs" + 0.017*"opec" + 0.016*"bpd" + 0.014*"barrel" + 0.011*"company" + 0.010*"pct"
2018-09-15 08:34:27,246 : INFO : topic #2 (0.187): 0.032*"bank" + 0.029*"billion" + 0.015*"market" + 0.015*"dollar" + 0.013*"currency" + 0.013*"exchange" + 0.013*"rate" + 0.013*"dlrs" + 0.011*"pct" + 0.009*"year"
2018-09-15 08:34:27,247 : INFO : topic diff=0.200897, rho=0.200670
2018-09-15 08:34:27,250 : INFO : PROGRESS: pass 9, at document #30/475
2018-09-15 08:34:27,271 : INFO : optimized alpha [0.23223361, 0.18349162, 0.18461265]
2018-09-15 08:34:27,272 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:27,275 : INFO : topic #0 (0.232): 0.034*"trade" + 0.014*"would" + 0.013*"japan" + 0.011*"country" + 0.009*"japanese" + 0.008*"year" + 0.008*"state" + 0.007*"agreement" + 0.007*"official" + 0.006*"market"
2018-09-15 08:34:27,275 : INFO : topic #1 (

2018-09-15 08:34:27,521 : INFO : PROGRESS: pass 9, at document #270/475
2018-09-15 08:34:27,546 : INFO : optimized alpha [0.23458241, 0.17820185, 0.19734491]
2018-09-15 08:34:27,547 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:27,551 : INFO : topic #0 (0.235): 0.029*"trade" + 0.020*"japan" + 0.015*"would" + 0.010*"japanese" + 0.009*"official" + 0.009*"country" + 0.008*"year" + 0.008*"minister" + 0.008*"world" + 0.008*"agreement"
2018-09-15 08:34:27,552 : INFO : topic #1 (0.178): 0.051*"oil" + 0.028*"mln" + 0.024*"price" + 0.017*"opec" + 0.016*"dlrs" + 0.015*"barrel" + 0.013*"crude" + 0.011*"pct" + 0.011*"saudi" + 0.010*"production"
2018-09-15 08:34:27,553 : INFO : topic #2 (0.197): 0.029*"bank" + 0.026*"billion" + 0.020*"dollar" + 0.015*"market" + 0.014*"currency" + 0.012*"dlrs" + 0.012*"pct" + 0.012*"rate" + 0.011*"exchange" + 0.009*"year"
2018-09-15 08:34:27,554 : INFO : topic diff=0.211518, rho=0.196748
2018-09-15 08:34:27,595 : INFO : -7

2018-09-15 08:34:27,902 : INFO : topic #0 (0.230): 0.034*"trade" + 0.014*"would" + 0.013*"japan" + 0.011*"country" + 0.009*"japanese" + 0.008*"year" + 0.008*"state" + 0.007*"agreement" + 0.007*"official" + 0.006*"market"
2018-09-15 08:34:27,903 : INFO : topic #1 (0.180): 0.046*"oil" + 0.024*"price" + 0.024*"mln" + 0.019*"opec" + 0.017*"bpd" + 0.017*"crude" + 0.016*"dlrs" + 0.013*"barrel" + 0.009*"pct" + 0.009*"company"
2018-09-15 08:34:27,904 : INFO : topic #2 (0.185): 0.031*"bank" + 0.029*"billion" + 0.016*"market" + 0.014*"currency" + 0.014*"dollar" + 0.013*"dlrs" + 0.012*"exchange" + 0.012*"rate" + 0.010*"china" + 0.010*"pct"
2018-09-15 08:34:27,905 : INFO : topic diff=0.183638, rho=0.193047
2018-09-15 08:34:27,905 : INFO : PROGRESS: pass 10, at document #60/475
2018-09-15 08:34:27,927 : INFO : optimized alpha [0.2318227, 0.17305416, 0.18928438]
2018-09-15 08:34:27,928 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:27,930 : INFO : topic #0 

2018-09-15 08:34:28,217 : INFO : topic diff=0.207068, rho=0.193047
2018-09-15 08:34:28,269 : INFO : -7.078 per-word bound, 135.1 perplexity estimate based on a held-out corpus of 30 documents with 2726 words
2018-09-15 08:34:28,271 : INFO : PROGRESS: pass 10, at document #300/475
2018-09-15 08:34:28,313 : INFO : optimized alpha [0.23163183, 0.18796171, 0.20120676]
2018-09-15 08:34:28,316 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:28,320 : INFO : topic #0 (0.232): 0.031*"trade" + 0.020*"japan" + 0.015*"would" + 0.011*"japanese" + 0.009*"official" + 0.009*"world" + 0.008*"country" + 0.008*"year" + 0.007*"minister" + 0.007*"market"
2018-09-15 08:34:28,321 : INFO : topic #1 (0.188): 0.051*"oil" + 0.026*"mln" + 0.023*"price" + 0.017*"dlrs" + 0.016*"opec" + 0.014*"barrel" + 0.013*"crude" + 0.011*"pct" + 0.011*"saudi" + 0.010*"stg"
2018-09-15 08:34:28,323 : INFO : topic #2 (0.201): 0.031*"bank" + 0.026*"billion" + 0.019*"dollar" + 0.015*"market" 

2018-09-15 08:34:28,652 : INFO : topic #0 (0.231): 0.032*"trade" + 0.014*"japan" + 0.013*"would" + 0.012*"country" + 0.009*"japanese" + 0.008*"year" + 0.008*"state" + 0.008*"official" + 0.007*"agreement" + 0.007*"world"
2018-09-15 08:34:28,653 : INFO : topic #1 (0.171): 0.045*"oil" + 0.024*"mln" + 0.023*"price" + 0.019*"opec" + 0.018*"bpd" + 0.017*"crude" + 0.016*"dlrs" + 0.013*"barrel" + 0.010*"saudi" + 0.009*"ecuador"
2018-09-15 08:34:28,655 : INFO : topic #2 (0.191): 0.033*"bank" + 0.028*"billion" + 0.018*"dollar" + 0.015*"market" + 0.014*"currency" + 0.012*"dlrs" + 0.012*"rate" + 0.012*"exchange" + 0.011*"mln" + 0.010*"pct"
2018-09-15 08:34:28,656 : INFO : topic diff=0.190628, rho=0.189547
2018-09-15 08:34:28,657 : INFO : PROGRESS: pass 11, at document #90/475
2018-09-15 08:34:28,681 : INFO : optimized alpha [0.22205293, 0.17586853, 0.18608007]
2018-09-15 08:34:28,682 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:28,689 : INFO : topic #0 

2018-09-15 08:34:29,001 : INFO : topic #2 (0.195): 0.031*"bank" + 0.027*"billion" + 0.019*"dollar" + 0.016*"market" + 0.013*"dlrs" + 0.013*"currency" + 0.012*"rate" + 0.011*"exchange" + 0.011*"pct" + 0.010*"mln"
2018-09-15 08:34:29,002 : INFO : topic diff=0.170372, rho=0.189547
2018-09-15 08:34:29,002 : INFO : PROGRESS: pass 11, at document #330/475
2018-09-15 08:34:29,024 : INFO : optimized alpha [0.22588046, 0.18562885, 0.19711648]
2018-09-15 08:34:29,026 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:29,030 : INFO : topic #0 (0.226): 0.032*"trade" + 0.018*"japan" + 0.016*"would" + 0.010*"japanese" + 0.009*"official" + 0.008*"world" + 0.008*"country" + 0.008*"year" + 0.007*"import" + 0.007*"state"
2018-09-15 08:34:29,031 : INFO : topic #1 (0.186): 0.047*"oil" + 0.024*"mln" + 0.021*"price" + 0.019*"dlrs" + 0.016*"barrel" + 0.015*"crude" + 0.014*"opec" + 0.012*"stg" + 0.011*"pct" + 0.011*"bpd"
2018-09-15 08:34:29,034 : INFO : topic #2 (0.197):

2018-09-15 08:34:29,346 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:29,349 : INFO : topic #0 (0.220): 0.033*"trade" + 0.014*"would" + 0.013*"japan" + 0.012*"country" + 0.009*"year" + 0.009*"japanese" + 0.008*"state" + 0.007*"official" + 0.007*"agreement" + 0.006*"united"
2018-09-15 08:34:29,350 : INFO : topic #1 (0.174): 0.048*"oil" + 0.024*"mln" + 0.024*"price" + 0.019*"opec" + 0.018*"dlrs" + 0.017*"bpd" + 0.016*"crude" + 0.014*"barrel" + 0.011*"saudi" + 0.011*"company"
2018-09-15 08:34:29,351 : INFO : topic #2 (0.185): 0.033*"bank" + 0.030*"billion" + 0.016*"dollar" + 0.015*"market" + 0.013*"mln" + 0.012*"dlrs" + 0.012*"currency" + 0.011*"exchange" + 0.011*"rate" + 0.010*"pct"
2018-09-15 08:34:29,353 : INFO : topic diff=0.174493, rho=0.186231
2018-09-15 08:34:29,354 : INFO : PROGRESS: pass 12, at document #120/475
2018-09-15 08:34:29,378 : INFO : optimized alpha [0.21824881, 0.17461263, 0.19181982]
2018-09-15 08:34:29,380 : INFO : merging

2018-09-15 08:34:29,681 : INFO : topic #2 (0.196): 0.030*"bank" + 0.024*"billion" + 0.018*"dollar" + 0.016*"market" + 0.016*"mln" + 0.013*"currency" + 0.012*"rate" + 0.012*"dlrs" + 0.012*"pct" + 0.011*"exchange"
2018-09-15 08:34:29,683 : INFO : topic diff=0.222397, rho=0.186231
2018-09-15 08:34:29,684 : INFO : PROGRESS: pass 12, at document #360/475
2018-09-15 08:34:29,712 : INFO : optimized alpha [0.22597054, 0.1701749, 0.19451758]
2018-09-15 08:34:29,721 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:29,724 : INFO : topic #0 (0.226): 0.034*"trade" + 0.017*"japan" + 0.016*"would" + 0.010*"japanese" + 0.009*"official" + 0.009*"country" + 0.008*"year" + 0.008*"world" + 0.007*"minister" + 0.007*"market"
2018-09-15 08:34:29,725 : INFO : topic #1 (0.170): 0.047*"oil" + 0.025*"mln" + 0.020*"price" + 0.017*"dlrs" + 0.017*"bpd" + 0.016*"crude" + 0.015*"barrel" + 0.014*"opec" + 0.010*"pct" + 0.009*"saudi"
2018-09-15 08:34:29,726 : INFO : topic #2 (0.1

2018-09-15 08:34:30,035 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:30,037 : INFO : topic #0 (0.209): 0.032*"trade" + 0.014*"japan" + 0.014*"would" + 0.011*"country" + 0.009*"year" + 0.008*"japanese" + 0.007*"state" + 0.007*"official" + 0.007*"agreement" + 0.007*"minister"
2018-09-15 08:34:30,038 : INFO : topic #1 (0.153): 0.050*"oil" + 0.026*"price" + 0.020*"opec" + 0.018*"dlrs" + 0.018*"mln" + 0.017*"bpd" + 0.017*"crude" + 0.013*"barrel" + 0.011*"saudi" + 0.010*"company"
2018-09-15 08:34:30,039 : INFO : topic #2 (0.188): 0.034*"bank" + 0.027*"billion" + 0.017*"dollar" + 0.017*"mln" + 0.015*"market" + 0.012*"currency" + 0.012*"rate" + 0.012*"exchange" + 0.012*"dlrs" + 0.010*"stg"
2018-09-15 08:34:30,040 : INFO : topic diff=0.178272, rho=0.183083
2018-09-15 08:34:30,042 : INFO : PROGRESS: pass 13, at document #150/475
2018-09-15 08:34:30,069 : INFO : optimized alpha [0.19728686, 0.15099974, 0.1775455]
2018-09-15 08:34:30,069 : INFO : mergin

2018-09-15 08:34:30,350 : INFO : topic #2 (0.188): 0.031*"bank" + 0.023*"billion" + 0.020*"mln" + 0.017*"market" + 0.017*"dollar" + 0.014*"currency" + 0.013*"rate" + 0.013*"pct" + 0.012*"exchange" + 0.012*"stg"
2018-09-15 08:34:30,351 : INFO : topic diff=0.191450, rho=0.183083
2018-09-15 08:34:30,351 : INFO : PROGRESS: pass 13, at document #390/475
2018-09-15 08:34:30,375 : INFO : optimized alpha [0.22368608, 0.14991932, 0.18622805]
2018-09-15 08:34:30,376 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:30,379 : INFO : topic #0 (0.224): 0.032*"trade" + 0.017*"japan" + 0.016*"would" + 0.012*"japanese" + 0.009*"country" + 0.009*"official" + 0.009*"year" + 0.008*"market" + 0.008*"state" + 0.007*"world"
2018-09-15 08:34:30,382 : INFO : topic #1 (0.150): 0.047*"oil" + 0.023*"price" + 0.021*"mln" + 0.018*"opec" + 0.016*"crude" + 0.016*"dlrs" + 0.016*"bpd" + 0.014*"barrel" + 0.010*"production" + 0.009*"pct"
2018-09-15 08:34:30,383 : INFO : topic #2 (0

2018-09-15 08:34:30,696 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:30,701 : INFO : topic #0 (0.191): 0.032*"trade" + 0.015*"japan" + 0.014*"would" + 0.010*"country" + 0.009*"year" + 0.009*"japanese" + 0.008*"official" + 0.007*"state" + 0.007*"agreement" + 0.007*"foreign"
2018-09-15 08:34:30,702 : INFO : topic #1 (0.140): 0.049*"oil" + 0.026*"price" + 0.023*"opec" + 0.018*"dlrs" + 0.018*"mln" + 0.016*"bpd" + 0.016*"crude" + 0.014*"barrel" + 0.013*"saudi" + 0.009*"company"
2018-09-15 08:34:30,703 : INFO : topic #2 (0.173): 0.031*"bank" + 0.031*"billion" + 0.020*"mln" + 0.016*"dollar" + 0.014*"market" + 0.013*"dlrs" + 0.012*"rate" + 0.011*"currency" + 0.011*"exchange" + 0.011*"stg"
2018-09-15 08:34:30,704 : INFO : topic diff=0.165003, rho=0.180090
2018-09-15 08:34:30,705 : INFO : PROGRESS: pass 14, at document #180/475
2018-09-15 08:34:30,732 : INFO : optimized alpha [0.20049009, 0.14091803, 0.17957374]
2018-09-15 08:34:30,733 : INFO : mergin

2018-09-15 08:34:31,022 : INFO : topic #2 (0.181): 0.031*"bank" + 0.024*"billion" + 0.020*"mln" + 0.017*"market" + 0.015*"dollar" + 0.015*"currency" + 0.013*"pct" + 0.012*"rate" + 0.011*"stg" + 0.011*"exchange"
2018-09-15 08:34:31,023 : INFO : topic diff=0.185602, rho=0.180090
2018-09-15 08:34:31,024 : INFO : PROGRESS: pass 14, at document #420/475
2018-09-15 08:34:31,046 : INFO : optimized alpha [0.20270132, 0.14017907, 0.17517725]
2018-09-15 08:34:31,048 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:31,050 : INFO : topic #0 (0.203): 0.032*"trade" + 0.017*"japan" + 0.015*"would" + 0.012*"japanese" + 0.008*"market" + 0.008*"year" + 0.008*"official" + 0.008*"country" + 0.007*"agreement" + 0.007*"state"
2018-09-15 08:34:31,052 : INFO : topic #1 (0.140): 0.048*"oil" + 0.023*"price" + 0.019*"mln" + 0.017*"opec" + 0.017*"crude" + 0.016*"dlrs" + 0.015*"bpd" + 0.014*"barrel" + 0.010*"production" + 0.010*"company"
2018-09-15 08:34:31,052 : INFO : top

2018-09-15 08:34:31,369 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:31,372 : INFO : topic #0 (0.196): 0.030*"trade" + 0.016*"japan" + 0.013*"would" + 0.009*"country" + 0.009*"japanese" + 0.008*"year" + 0.008*"official" + 0.007*"market" + 0.007*"agreement" + 0.007*"foreign"
2018-09-15 08:34:31,375 : INFO : topic #1 (0.137): 0.047*"oil" + 0.025*"price" + 0.022*"opec" + 0.018*"mln" + 0.016*"bpd" + 0.016*"dlrs" + 0.015*"barrel" + 0.015*"crude" + 0.010*"saudi" + 0.009*"company"
2018-09-15 08:34:31,378 : INFO : topic #2 (0.174): 0.030*"billion" + 0.029*"bank" + 0.020*"dollar" + 0.018*"mln" + 0.015*"market" + 0.014*"exchange" + 0.014*"currency" + 0.013*"rate" + 0.013*"dlrs" + 0.011*"pct"
2018-09-15 08:34:31,380 : INFO : topic diff=0.232440, rho=0.177239
2018-09-15 08:34:31,381 : INFO : PROGRESS: pass 15, at document #210/475
2018-09-15 08:34:31,402 : INFO : optimized alpha [0.19930595, 0.13497975, 0.1777398]
2018-09-15 08:34:31,405 : INFO : mergin

2018-09-15 08:34:31,683 : INFO : topic #2 (0.171): 0.028*"bank" + 0.023*"billion" + 0.022*"mln" + 0.018*"market" + 0.016*"dollar" + 0.014*"currency" + 0.013*"pct" + 0.013*"rate" + 0.012*"exchange" + 0.012*"dlrs"
2018-09-15 08:34:31,684 : INFO : topic diff=0.139835, rho=0.177239
2018-09-15 08:34:31,685 : INFO : PROGRESS: pass 15, at document #450/475
2018-09-15 08:34:31,706 : INFO : optimized alpha [0.21152513, 0.13719553, 0.17524984]
2018-09-15 08:34:31,707 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:31,710 : INFO : topic #0 (0.212): 0.031*"trade" + 0.015*"would" + 0.014*"japan" + 0.010*"japanese" + 0.009*"country" + 0.009*"year" + 0.008*"official" + 0.007*"market" + 0.007*"agreement" + 0.007*"state"
2018-09-15 08:34:31,711 : INFO : topic #1 (0.137): 0.049*"oil" + 0.026*"price" + 0.018*"dlrs" + 0.017*"opec" + 0.016*"mln" + 0.016*"bpd" + 0.015*"crude" + 0.014*"barrel" + 0.012*"company" + 0.009*"pct"
2018-09-15 08:34:31,712 : INFO : topic #2 

2018-09-15 08:34:32,018 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:32,020 : INFO : topic #0 (0.197): 0.030*"trade" + 0.016*"japan" + 0.014*"would" + 0.010*"japanese" + 0.010*"country" + 0.008*"minister" + 0.008*"official" + 0.008*"agreement" + 0.007*"market" + 0.007*"year"
2018-09-15 08:34:32,021 : INFO : topic #1 (0.133): 0.048*"oil" + 0.025*"price" + 0.020*"opec" + 0.018*"mln" + 0.015*"dlrs" + 0.015*"bpd" + 0.014*"crude" + 0.014*"barrel" + 0.009*"saudi" + 0.009*"production"
2018-09-15 08:34:32,022 : INFO : topic #2 (0.172): 0.027*"billion" + 0.026*"bank" + 0.019*"dollar" + 0.018*"mln" + 0.017*"market" + 0.015*"rate" + 0.014*"currency" + 0.013*"exchange" + 0.012*"pct" + 0.011*"dlrs"
2018-09-15 08:34:32,024 : INFO : topic diff=0.186185, rho=0.174519
2018-09-15 08:34:32,024 : INFO : PROGRESS: pass 16, at document #240/475
2018-09-15 08:34:32,052 : INFO : optimized alpha [0.20434925, 0.13216624, 0.17081794]
2018-09-15 08:34:32,053 : INFO : m

2018-09-15 08:34:32,349 : INFO : topic #2 (0.169): 0.028*"bank" + 0.027*"billion" + 0.019*"mln" + 0.017*"market" + 0.014*"currency" + 0.014*"dollar" + 0.013*"pct" + 0.012*"exchange" + 0.012*"rate" + 0.011*"dlrs"
2018-09-15 08:34:32,350 : INFO : topic diff=0.219125, rho=0.174519
2018-09-15 08:34:32,376 : INFO : -6.833 per-word bound, 114.0 perplexity estimate based on a held-out corpus of 25 documents with 2811 words
2018-09-15 08:34:32,376 : INFO : PROGRESS: pass 16, at document #475/475
2018-09-15 08:34:32,392 : INFO : optimized alpha [0.19964509, 0.12891379, 0.1594953]
2018-09-15 08:34:32,394 : INFO : merging changes from 25 documents into a model of 475 documents
2018-09-15 08:34:32,398 : INFO : topic #0 (0.200): 0.033*"trade" + 0.014*"japan" + 0.014*"would" + 0.010*"country" + 0.010*"japanese" + 0.008*"year" + 0.008*"state" + 0.007*"official" + 0.007*"agreement" + 0.007*"market"
2018-09-15 08:34:32,400 : INFO : topic #1 (0.129): 0.049*"oil" + 0.026*"price" + 0.018*"dlrs" + 0.017*"c

2018-09-15 08:34:32,675 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:32,679 : INFO : topic #0 (0.204): 0.030*"trade" + 0.017*"japan" + 0.014*"would" + 0.010*"japanese" + 0.009*"country" + 0.008*"official" + 0.008*"minister" + 0.008*"year" + 0.007*"agreement" + 0.007*"import"
2018-09-15 08:34:32,680 : INFO : topic #1 (0.131): 0.048*"oil" + 0.025*"price" + 0.019*"mln" + 0.018*"opec" + 0.016*"dlrs" + 0.014*"crude" + 0.014*"barrel" + 0.013*"bpd" + 0.010*"saudi" + 0.010*"production"
2018-09-15 08:34:32,685 : INFO : topic #2 (0.167): 0.029*"bank" + 0.027*"billion" + 0.018*"mln" + 0.017*"dollar" + 0.016*"market" + 0.015*"rate" + 0.015*"exchange" + 0.014*"currency" + 0.011*"pct" + 0.011*"dlrs"
2018-09-15 08:34:32,687 : INFO : topic diff=0.168886, rho=0.171920
2018-09-15 08:34:32,687 : INFO : PROGRESS: pass 17, at document #270/475
2018-09-15 08:34:32,711 : INFO : optimized alpha [0.20182486, 0.13115247, 0.1627024]
2018-09-15 08:34:32,714 : INFO : me

2018-09-15 08:34:33,016 : INFO : topic #1 (0.128): 0.049*"oil" + 0.026*"price" + 0.018*"dlrs" + 0.017*"crude" + 0.017*"opec" + 0.016*"mln" + 0.015*"bpd" + 0.014*"barrel" + 0.011*"company" + 0.010*"pct"
2018-09-15 08:34:33,017 : INFO : topic #2 (0.158): 0.032*"bank" + 0.026*"billion" + 0.018*"mln" + 0.017*"market" + 0.015*"dollar" + 0.014*"rate" + 0.013*"currency" + 0.013*"exchange" + 0.011*"pct" + 0.010*"stg"
2018-09-15 08:34:33,017 : INFO : topic diff=0.174310, rho=0.171920
2018-09-15 08:34:33,018 : INFO : PROGRESS: pass 18, at document #30/475
2018-09-15 08:34:33,039 : INFO : optimized alpha [0.20125322, 0.13114835, 0.15523773]
2018-09-15 08:34:33,044 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:33,048 : INFO : topic #0 (0.201): 0.034*"trade" + 0.013*"japan" + 0.013*"would" + 0.011*"country" + 0.009*"japanese" + 0.008*"year" + 0.008*"state" + 0.007*"agreement" + 0.007*"official" + 0.007*"market"
2018-09-15 08:34:33,049 : INFO : topic #1 (0

2018-09-15 08:34:33,331 : INFO : optimized alpha [0.2010857, 0.13029647, 0.15958515]
2018-09-15 08:34:33,332 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:33,335 : INFO : topic #0 (0.201): 0.029*"trade" + 0.019*"japan" + 0.014*"would" + 0.011*"japanese" + 0.009*"official" + 0.009*"country" + 0.009*"year" + 0.008*"market" + 0.007*"minister" + 0.007*"agreement"
2018-09-15 08:34:33,336 : INFO : topic #1 (0.130): 0.050*"oil" + 0.025*"price" + 0.018*"mln" + 0.018*"opec" + 0.017*"dlrs" + 0.015*"barrel" + 0.014*"crude" + 0.011*"pct" + 0.011*"saudi" + 0.010*"bpd"
2018-09-15 08:34:33,337 : INFO : topic #2 (0.160): 0.031*"bank" + 0.026*"billion" + 0.021*"dollar" + 0.019*"mln" + 0.016*"market" + 0.014*"currency" + 0.013*"rate" + 0.012*"exchange" + 0.012*"pct" + 0.011*"stg"
2018-09-15 08:34:33,338 : INFO : topic diff=0.181038, rho=0.169435
2018-09-15 08:34:33,376 : INFO : -7.044 per-word bound, 131.9 perplexity estimate based on a held-out corpus of 30 d

2018-09-15 08:34:33,677 : INFO : topic #1 (0.130): 0.046*"oil" + 0.025*"price" + 0.018*"opec" + 0.017*"dlrs" + 0.017*"crude" + 0.017*"bpd" + 0.017*"mln" + 0.013*"barrel" + 0.009*"company" + 0.009*"pct"
2018-09-15 08:34:33,681 : INFO : topic #2 (0.153): 0.031*"bank" + 0.027*"billion" + 0.020*"mln" + 0.017*"market" + 0.015*"dollar" + 0.014*"currency" + 0.013*"rate" + 0.013*"exchange" + 0.012*"stg" + 0.011*"pct"
2018-09-15 08:34:33,685 : INFO : topic diff=0.155427, rho=0.167054
2018-09-15 08:34:33,687 : INFO : PROGRESS: pass 19, at document #60/475
2018-09-15 08:34:33,702 : INFO : optimized alpha [0.19882703, 0.12604125, 0.16004272]
2018-09-15 08:34:33,703 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:33,705 : INFO : topic #0 (0.199): 0.032*"trade" + 0.015*"japan" + 0.013*"would" + 0.012*"country" + 0.010*"japanese" + 0.008*"year" + 0.008*"state" + 0.008*"official" + 0.007*"market" + 0.007*"agreement"
2018-09-15 08:34:33,706 : INFO : topic #1 (0

2018-09-15 08:34:33,993 : INFO : PROGRESS: pass 19, at document #300/475
2018-09-15 08:34:34,012 : INFO : optimized alpha [0.19917622, 0.13374747, 0.1613215]
2018-09-15 08:34:34,015 : INFO : merging changes from 30 documents into a model of 475 documents
2018-09-15 08:34:34,018 : INFO : topic #0 (0.199): 0.031*"trade" + 0.019*"japan" + 0.014*"would" + 0.011*"japanese" + 0.009*"official" + 0.008*"year" + 0.008*"country" + 0.008*"market" + 0.008*"world" + 0.007*"agreement"
2018-09-15 08:34:34,018 : INFO : topic #1 (0.134): 0.050*"oil" + 0.025*"price" + 0.017*"dlrs" + 0.017*"mln" + 0.016*"opec" + 0.014*"barrel" + 0.013*"crude" + 0.011*"saudi" + 0.011*"pct" + 0.010*"bpd"
2018-09-15 08:34:34,019 : INFO : topic #2 (0.161): 0.032*"bank" + 0.026*"billion" + 0.020*"dollar" + 0.020*"mln" + 0.016*"market" + 0.014*"currency" + 0.013*"rate" + 0.012*"exchange" + 0.012*"stg" + 0.012*"dlrs"
2018-09-15 08:34:34,020 : INFO : topic diff=0.148212, rho=0.167054
2018-09-15 08:34:34,021 : INFO : PROGRESS: pa

Wall time: 13.2 s


In [12]:
# To see the topics, with the most probable words in each topic. What topics to you see? 
lda.show_topics()

[(0,
  '0.033*"trade" + 0.014*"japan" + 0.014*"would" + 0.010*"country" + 0.010*"japanese" + 0.008*"year" + 0.008*"state" + 0.007*"official" + 0.007*"agreement" + 0.007*"market"'),
 (1,
  '0.049*"oil" + 0.026*"price" + 0.018*"dlrs" + 0.017*"crude" + 0.017*"opec" + 0.016*"mln" + 0.015*"bpd" + 0.014*"barrel" + 0.011*"company" + 0.010*"pct"'),
 (2,
  '0.032*"bank" + 0.026*"billion" + 0.019*"mln" + 0.017*"market" + 0.015*"dollar" + 0.014*"rate" + 0.013*"currency" + 0.013*"exchange" + 0.011*"pct" + 0.010*"stg"')]

In [13]:
# You can also request to see more words per topic
lda.show_topics(num_words=20)

[(0,
  '0.033*"trade" + 0.014*"japan" + 0.014*"would" + 0.010*"country" + 0.010*"japanese" + 0.008*"year" + 0.008*"state" + 0.007*"official" + 0.007*"agreement" + 0.007*"market" + 0.007*"world" + 0.006*"foreign" + 0.006*"united" + 0.006*"minister" + 0.006*"new" + 0.006*"last" + 0.005*"told" + 0.005*"import" + 0.005*"industry" + 0.005*"bill"'),
 (1,
  '0.049*"oil" + 0.026*"price" + 0.018*"dlrs" + 0.017*"crude" + 0.017*"opec" + 0.016*"mln" + 0.015*"bpd" + 0.014*"barrel" + 0.011*"company" + 0.010*"pct" + 0.008*"production" + 0.008*"last" + 0.007*"output" + 0.007*"petroleum" + 0.007*"ecuador" + 0.006*"day" + 0.006*"saudi" + 0.006*"would" + 0.006*"year" + 0.006*"per"'),
 (2,
  '0.032*"bank" + 0.026*"billion" + 0.019*"mln" + 0.017*"market" + 0.015*"dollar" + 0.014*"rate" + 0.013*"currency" + 0.013*"exchange" + 0.011*"pct" + 0.010*"stg" + 0.010*"dlrs" + 0.008*"money" + 0.008*"central" + 0.008*"deficit" + 0.008*"paris" + 0.007*"system" + 0.007*"monetary" + 0.007*"year" + 0.006*"treasury" + 0.0

In [14]:
# Get the topic distribution of documents
dtopics_train = lda.get_document_topics(dtm_train)

In [15]:
# Get the topic likelihood for the first document in train set
for i in range(0, 5):
    print(dtopics_train[i])

[(0, 0.99748653)]
[(0, 0.09187004), (2, 0.9053942)]
[(0, 0.7505174), (1, 0.11056269), (2, 0.13891992)]
[(0, 0.96214074), (1, 0.016980482), (2, 0.020878794)]
[(1, 0.9644994), (2, 0.03264432)]


In [16]:
# Pick the topic with the highest probability for each document, map it to the label
# NOTE: the mapping may change in a different run
from operator import itemgetter
top_train = [ max(t, key=itemgetter(1))[0] for t in dtopics_train ]
dict = {1: 'crude', 2: 'money-fx', 0: 'trade'}
topic_train = [ dict[t] for t in top_train ]

In [17]:
# Now let's see how well these topics match the actual categories
import numpy as np
from sklearn import metrics
print(metrics.confusion_matrix(topic_train, y_train))
print(np.mean(topic_train == y_train) )
print(metrics.classification_report(topic_train, y_train))

[[153   1   1]
 [  3 107  33]
 [ 18  18 141]]
0.8442105263157895
             precision    recall  f1-score   support

      crude       0.88      0.99      0.93       155
   money-fx       0.85      0.75      0.80       143
      trade       0.81      0.80      0.80       177

avg / total       0.84      0.84      0.84       475



In [18]:
# The typical practice is to use the reserved test set for evaluation
toks_test = X_test.apply(pre_process)
dtm_test = [dictionary.doc2bow(d) for d in toks_test ]
dtopics_test = lda.get_document_topics(dtm_test)
top_test = [ max(t,key=itemgetter(1))[0] for t in dtopics_test ]
topic_test = [ dict[t] for t in top_test ]
print(metrics.confusion_matrix(topic_test, y_test))
print(np.mean(topic_test == y_test) )
print(metrics.classification_report(topic_test, y_test))

[[62  1  0]
 [ 4 66 16]
 [13 13 60]]
0.8
             precision    recall  f1-score   support

      crude       0.78      0.98      0.87        63
   money-fx       0.82      0.77      0.80        86
      trade       0.79      0.70      0.74        86

avg / total       0.80      0.80      0.80       235

