### Exercise 6 - Topic models

##### Exercise 6.1

In [92]:
import os
import nltk
import numpy as np
import scipy
from numpy import matlib
import sklearn
import sklearn.decomposition
import matplotlib.pyplot as plt
import sklearn.manifold
import scipy.stats

##### 6.1a

In [93]:
#%% Getting a list of directory contents
def gettextlist(directory_path):
    directory_textfiles=[]
    directory_nontextfiles=[]
    directory_nonfiles=[]
    # Process each item in the directory
    directory_contents=os.listdir(directory_path)
    for contentitem in directory_contents:
        temp_fullpath=os.path.join(directory_path, contentitem)
        # Non-files (e.g. subdirectories) are stored separately
        if os.path.isfile(temp_fullpath)==0:
            directory_nonfiles.append(contentitem)
        else:
            # Is this a non-text file (not ending in .txt)?
            if temp_fullpath.find('.txt')==-1:
                directory_nontextfiles.append(contentitem)
            else:
                # This is a text file
                directory_textfiles.append(contentitem)
    return(directory_textfiles,directory_nontextfiles,directory_nonfiles)

In [94]:
#%% Basic file crawler
def basicfilecrawler(directory_path):
    # Store filenames read and their text content
    num_files_read=0
    crawled_filenames=[]
    mycrawled_texts=[]
    directory_contentlists=gettextlist(directory_path)
    # In this basic crawled we just process text files
    # and do not handle subdirectories
    #directory_textfiles=directory_contentlists[0]
    directory_nontextfiles=directory_contentlists[1]
    for contentitem in directory_nontextfiles:
        #print('Reading file:')
        #print(contentitem)
        # Open the file and read its contents
        temp_fullpath=os.path.join(directory_path, contentitem)
        #temp_file=open(temp_fullpath,'r',encoding='utf-8',errors='ignore')
        temp_file=open(temp_fullpath,'r')
        temp_text=temp_file.read()
        temp_file.close()
        # Store the read filename and content
        crawled_filenames.append(contentitem)
        mycrawled_texts.append(temp_text)
        num_files_read=num_files_read+1
    return(crawled_filenames, mycrawled_texts)


In [95]:
crawled_filenames, mycrawled_texts = basicfilecrawler('20news')

In [96]:
len(mycrawled_texts)

4000

In [97]:
crawled_filenames[0:5]

['100521', '101551', '101552', '101553', '101554']

##### 6.1b

In [None]:
# Exclude header lines from each message
excludedlinemarkers=['Xref:','Path:','From:','Newsgroups:','Subject:','Summary:',
                     'Keywords:','Message-ID:','Date:','Expires:','Followup-To:','Distribution:', 
                     'Organization:','Approved:','Supersedes:','Lines:','NNTP-Posting-Host:', 
                     'References:','Sender:','In-Reply-To:','Article-I.D.:','Reply-To:', 
                     'Nntp-Posting-Host:', 'Newsgroup', 'document_id']

for k in range(len(mycrawled_texts)):
    print(k)
    templines = mycrawled_texts[k].splitlines()
    remaininglines = []
    for l in range(len(templines)):
        line_should_be_excluded = 0
        for m in range(len(excludedlinemarkers)):
            if len(templines[l]) >= len(excludedlinemarkers[m]):
                if excludedlinemarkers[m] == templines[l][0:len(excludedlinemarkers[m])]:
                    line_should_be_excluded = 1
                    break
        if line_should_be_excluded == 0:
            remaininglines.append(templines[l])
    mycrawled_texts[k] = '\n'.join(remaininglines)

In [99]:
downloaded_texts = []
# remove leading and trailing whitespaces
for k in range(len(mycrawled_texts)):
    temp_texts = mycrawled_texts[k].strip()
    temp_texts = ' '.join(temp_texts.split())
    downloaded_texts.append(temp_texts)

In [101]:
len(downloaded_texts)

4000

In [102]:
# Tokenize downloaded texts and change them to NLTK format
mydownloaded_nltk_texts = []
for k in range(len(downloaded_texts)):
    temp_tokenizedtext = nltk.word_tokenize(downloaded_texts[k])
    temp_nltktext = nltk.Text(temp_tokenizedtext)
    mydownloaded_nltk_texts.append(temp_nltktext)

In [103]:
mydownloaded_nltk_texts

[<Text: The Orioles ' pitching staff again is having...>,
 <Text: In article < C4vIr5.L3r @ shuksan.ds.boeing.com > ,...>,
 <Text: In article < 1993Apr5.135153.11132 @ wdl.loral.com > gwm...>,
 <Text: THANKS TO ALL OF YOU WHO RESPONDED TO...>,
 <Text: The subject says it all . My 1984...>,
 <Text: wolfson @ regatta.sps.mot.com ( Stephen Wolfson ) writes...>,
 <Text: : TIN [ version 1.1 PL8.8>,
 <Text: I 'm looking for a replacement radio/tape player...>,
 <Text: I am thinking about getting an Infiniti G20...>,
 <Text: In article < 1993Apr5.175719.7892 @ telxon.mis.telxon.com > joes...>,
 <Text: In article < 1993Apr5.212645.15988 @ nntpd.lkg.dec.com > kenyon...>,
 <Text: In article < 3mwF2B1w165w @ njcc.wisdom.bubble.org > ,...>,
 <Text: Do n't have a list of what 's...>,
 <Text: In article < 1993Apr3.005245.10615 @ michael.apple.com > ems...>,
 <Text: In article < 1pq29p $ 29p @ seven-up.East.Sun.COM...>,
 <Text: News-Software : VAX/VMS VNEWS 1.41 In article <...>,
 <Text: This past wi

In [104]:
# Make all downloaded texts lowercase
mydownloaded_lowercase_texts = []
for k in range(len(mydownloaded_nltk_texts)):
    temp_lowercase_text = []
    for l in range(len(mydownloaded_nltk_texts[k])):
        lowercase_word = mydownloaded_nltk_texts[k][l].lower()
        temp_lowercase_text.append(lowercase_word)
    temp_lowercasetest = nltk.Text(temp_lowercase_text)
    mydownloaded_lowercase_texts.append(temp_lowercase_text)

In [105]:
len(mydownloaded_lowercase_texts)

4000

In [106]:
# Convert a POS tag for WordNet
def tagtowordnet(postag):
    wordnettag=-1
    if postag[0]=='N':
        wordnettag='n'
    elif postag[0]=='V':
        wordnettag='v'
    elif postag[0]=='J':
        wordnettag='a'
    elif postag[0]=='R':
        wordnettag='r'
    return(wordnettag)

In [107]:
# POS tag and lemmatize the loaded texts
# Download tagger and wordnet resources if you do not have them already
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# lemmatize downloaded texts
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatizetext(nltktexttolemmatize):
    # tag the text with POS tags
    taggedtext = nltk.pos_tag(nltktexttolemmatize)
    # lemmatize each word text
    lemmatizedtext = []
    for l in range(len(taggedtext)):
        # Lemmatize a word using the WordNet converted POS tag
        wordtolemmatize = taggedtext[l][0]
        wordnettag = tagtowordnet(taggedtext[l][1])
        if wordnettag != -1:
            lemmatizedword = lemmatizer.lemmatize(wordtolemmatize, wordnettag)
        else:
            lemmatizedword = wordtolemmatize
        # store the lemmatized word
        lemmatizedtext.append(lemmatizedword)
    return(lemmatizedtext)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tunde\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tunde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [108]:
mydownloaded_lemmatizedtexts = []
for k in range(len(mydownloaded_lowercase_texts)):
    lemmatizedtext = lemmatizetext(mydownloaded_lowercase_texts[k])
    lemmatizedtext = nltk.Text(lemmatizedtext)
    mydownloaded_lemmatizedtexts.append(lemmatizedtext)

In [109]:
mydownloaded_lemmatizedtexts[0:5]

[<Text: the oriole ' pitch staff again be have...>,
 <Text: in article < c4vir5.l3r @ shuksan.ds.boeing.com > ,...>,
 <Text: in article < 1993apr5.135153.11132 @ wdl.loral.com > gwm...>,
 <Text: thanks to all of you who respond to...>,
 <Text: the subject say it all . my 1984...>]

In [110]:
# find the vocabularies of each document
myvocabularies = []
myindices_in_vocabularies = []
for k in range(len(mydownloaded_lemmatizedtexts)):
    # get unique words and where they occur
    temptext = mydownloaded_lemmatizedtexts[k]
    unique_results = np.unique(temptext, return_inverse = True)
    unique_words = unique_results[0]
    word_indices = unique_results[1]
    # store the vocabularies and the indices
    myvocabularies.append(unique_words)
    myindices_in_vocabularies.append(word_indices)

In [111]:
len(myvocabularies)

4000

In [112]:
# unify the vocabularies
# first concatenate all vocabularies
tempvocabulary = []
for k in range(len(mydownloaded_lemmatizedtexts)):
    tempvocabulary.extend(myvocabularies[k])

# find unique words among all the vocabularies
uniqueresults = np.unique(tempvocabulary, return_inverse = True)
unifiedvocabulary = uniqueresults[0]
wordindices = uniqueresults[1]

In [113]:
len(unifiedvocabulary)

42763

In [114]:
# translate previous indices to the unified vocabulary
# must keep track where each vocabulary started in the concatenated one
vocabularystart = 0
myindices_in_unifiedvocabulary = []
for k in range(len(mydownloaded_lemmatizedtexts)):
    # in order to shift word indices, we must temporarily
    # convert their data type to a numpy array
    tempindices = np.array(myindices_in_vocabularies[k])
    tempindices = tempindices + vocabularystart
    tempindices = wordindices[tempindices]
    myindices_in_unifiedvocabulary.append(tempindices)
    vocabularystart = vocabularystart + len(myvocabularies[k])

In [115]:
len(unifiedvocabulary)

42763

In [116]:
len(myindices_in_unifiedvocabulary)

4000

In [117]:
myindices_in_unifiedvocabulary[1][0:10]

array([24498, 12907, 10996, 15189, 11089, 36258, 11087,   947, 21631,
       11089], dtype=int64)

In [118]:
len(unifiedvocabulary[myindices_in_unifiedvocabulary[1]])

560

In [119]:
len(mydownloaded_lemmatizedtexts[1])

560

In [120]:
# total count of each unique word over all the downloaded documents
unifiedvocabulary_totaloccurrencecounts = np.zeros((len(unifiedvocabulary),1))

# count occurrences
for k in range(len(mydownloaded_lemmatizedtexts)):
    occurrencecounts = np.zeros((len(unifiedvocabulary), 1))
    for l in range(len(myindices_in_unifiedvocabulary[k])):
        occurrencecounts[myindices_in_unifiedvocabulary[k][l]] = (occurrencecounts[myindices_in_unifiedvocabulary[k][l]]
                                                                 + 1)
    unifiedvocabulary_totaloccurrencecounts = unifiedvocabulary_totaloccurrencecounts + occurrencecounts

In [121]:
unifiedvocabulary_totaloccurrencecounts

array([[7.079e+03],
       [2.518e+03],
       [1.224e+03],
       ...,
       [2.000e+00],
       [6.000e+00],
       [2.000e+00]])

In [122]:
len(unifiedvocabulary_totaloccurrencecounts), len(unifiedvocabulary)

(42763, 42763)

In [123]:
# top-100 words according to the largest total occurrence count
highest_totaloccurrences_indices = np.argsort(-1*unifiedvocabulary_totaloccurrencecounts, axis=0)
print(np.squeeze(unifiedvocabulary[highest_totaloccurrences_indices[0:100]]))
print(np.squeeze(unifiedvocabulary_totaloccurrencecounts[highest_totaloccurrences_indices[0:100]]))

['>' 'the' ',' '.' '--' 'be' 'a' 'to' 'i' ')' 'in' 'and' '(' 'of' ':' '@'
 'have' 'that' 'it' 'you' '!' 'do' 'for' '?' "'s" 'on' '|' "n't" '-'
 'this' 'with' 'but' 'he' "''" 'not' '0' 'they' '...' '1' 'as' '<' '``'
 'at' 'if' 'get' 'write' 'or' 'my' '2' 'article' 'game' 'go' 'would' '#'
 'can' 'about' 'one' 'will' 'all' 'there' 'an' 'what' 'from' 'out' 'so'
 '*' 'good' 'car' 'like' 'year' 'by' 'think' 'me' 'team' 'up' 'his' '%'
 'just' 'when' '3' 'make' 'more' 'no' 'your' 'know' 'any' 'say' 'who'
 'than' 'we' '4' 'time' 'some' 'play' 'well' 'see' 'how' 'player' "'m"
 'only']
[40640. 40004. 39497. 39447. 35859. 27283. 17906. 17356. 16024. 14883.
 14093. 14011. 13692. 13179. 11444.  9901.  9497.  9222.  8767.  7162.
  7079.  7035.  7012.  6686.  6181.  6043.  5925.  5222.  4878.  4748.
  4509.  4285.  4178.  4150.  4000.  3977.  3935.  3868.  3795.  3751.
  3648.  3629.  3567.  3525.  3266.  3261.  3238.  3217.  2837.  2745.
  2726.  2692.  2684.  2518.  2408.  2392.  2371.  2306.  2274.

In [124]:
#%% Vocabulary pruning
nltkstopwords = nltk.corpus.stopwords.words('english')
pruningdecisions = np.zeros((len(unifiedvocabulary),1))
for k in range(len(unifiedvocabulary)):
    # Rule 1: check the nltk stop word list
    if (unifiedvocabulary[k] in nltkstopwords):
        pruningdecisions[k] = 1
    # Rule 2: if the word is too short
    if (len(unifiedvocabulary[k]) < 3):
        pruningdecisions[k] = 1
      # Rule 3: if the word is too long
    if (len(unifiedvocabulary[k]) > 20):
        pruningdecisions[k] = 1
    # Rule 4: if the word is in the top 1% of frequent words
    if (k in highest_totaloccurrences_indices[0:int(np.floor(len(unifiedvocabulary)*0.01))]):
        pruningdecisions[k] = 1
    # Rule 5: if the word occurs less than 4 times
    if(unifiedvocabulary_totaloccurrencecounts[k] < 4):
        pruningdecisions[k] = 1

In [125]:
print('Top-100 words after pruning the unified vocabulary:\n')
remaining_indices = np.squeeze(np.where(pruningdecisions==0)[0])
remaining_vocabulary = unifiedvocabulary[remaining_indices]
remainingvocabulary_totaloccurrencecounts = unifiedvocabulary_totaloccurrencecounts[remaining_indices]
remaining_highest_totaloccurrences_indices = np.argsort(-1*remainingvocabulary_totaloccurrencecounts, axis=0)
print(np.squeeze(remaining_vocabulary[remaining_highest_totaloccurrences_indices[0:100]]))
print(np.squeeze(remainingvocabulary_totaloccurrencecounts[remaining_highest_totaloccurrences_indices[0:100]]))

Top-100 words after pruning the unified vocabulary:

['ago' 'cost' 'large' 'city' 'mind' 'total' 'mention' 'brave' 'man' 'ford'
 'penalty' 'final' 'anyway' 'bill' 'coach' 'idea' 'saw' 'performance'
 'beat' 'version' 'rule' 'montreal' 'hitter' 'seat' 'group' 'friend' 'ice'
 'today' 'face' 'although' 'order' 'almost' 'comment' 'minute' 'stats'
 'bos' 'hold' 'det' 'follow' 'job' 'american' 'insurance' 'month' 'smith'
 'past' 'local' 'easy' 'honda' 'tie' 'cal' 'news' 'hell' 'x-newsreader'
 'wait' 'ticket' 'bring' 'jet' 'walk' 'hope' 'helmet' 'rider' 'joe'
 'stuff' 'morris' 'van' 'left' 'compare' 'note' 'word' 'york' 'experience'
 'center' 'flyer' 'information' 'add' 'break' 'defense' 'puck' 'set' 'pen'
 'rear' 'contact' 'netcom.com' 'others' 'e-mail' 'later' 'young' 'claim'
 'late' 'whether' 'tor' 'design' 'matter' 'pit' 'life' 'pull' 'decide'
 'instead' 'open' 'difference']
[247. 246. 246. 246. 245. 244. 244. 243. 242. 241. 240. 240. 236. 232.
 231. 230. 229. 229. 228. 225. 223. 223. 223.

In [126]:
#%% Get indices of documents to remaining words
oldtopruned=[]
tempind=-1
for k in range(len(unifiedvocabulary)):
    if pruningdecisions[k]==0:
        tempind=tempind+1
        oldtopruned.append(tempind)
    else:
        oldtopruned.append(-1)

In [127]:
#%% Create pruned texts
mycrawled_prunedtexts=[]
myindices_in_prunedvocabulary=[]
for k in range(len(mydownloaded_lemmatizedtexts)):
    #print(k)
    temp_newindices=[]
    temp_newdoc=[]
    for l in range(len(mydownloaded_lemmatizedtexts[k])):
        temp_oldindex=myindices_in_unifiedvocabulary[k][l]
        temp_newindex=oldtopruned[temp_oldindex]
        if temp_newindex!=-1:
            temp_newindices.append(temp_newindex)
            temp_newdoc.append(unifiedvocabulary[temp_oldindex])
    mycrawled_prunedtexts.append(temp_newdoc)
    myindices_in_prunedvocabulary.append(temp_newindices)


In [128]:
len(mycrawled_prunedtexts[1])

85

In [129]:
len(remaining_vocabulary)

12087

##### 6.1c

In [39]:
# create count vectors
n_docs = len(mycrawled_prunedtexts) # 4000=total number of documents
n_vocab = len(remaining_vocabulary) # 12087
# matrix of term frequencies
tfmatrix = scipy.sparse.lil_matrix((n_docs, n_vocab)) # 4000x12087
# row vector of document frequencies
dfvector = scipy.sparse.lil_matrix((1, n_vocab)) #1x12087

In [40]:
n_docs, n_vocab

(4000, 12087)

In [41]:
tfmatrix.shape, dfvector.shape

((4000, 12087), (1, 12087))

In [42]:
# loop over the documents
for k in range(n_docs):
    # row vector of which words occur in this document
    temp_dfvector = scipy.sparse.lil_matrix((1, n_vocab)) # #1x12087
    # we loop over the words in each document
    for l in range(len(mycrawled_prunedtexts[k])):
        # add current word to term-frequency count and document count
        currentword = myindices_in_prunedvocabulary[k][l]
        tfmatrix[k, currentword] = tfmatrix[k, currentword] + 1
        #tfmatrix[k, currentword] = tfmatrix[k, currentword]/len(mycrawled_prunedtexts[k])
        temp_dfvector[0, currentword] = 1
    # add which words occur in this document to overall document counts
    
    dfvector = dfvector + temp_dfvector

In [43]:
# length-normalized frequency for the TF part
for i in range(n_docs):
    for j in range(len(tfmatrix.data[i])):
        tfmatrix.data[i][j] = tfmatrix.data[i][j] / len(mycrawled_prunedtexts[i])

In [44]:
sum(tfmatrix.data[0]), sum(tfmatrix.data[1]) # sum to 1 because of the normalization

(1.0000000000000009, 0.9999999999999999)

In [45]:
sum(tfmatrix.data[2]), sum(tfmatrix.data[3])

(0.9999999999999994, 1.0000000000000002)

In [46]:
# create IDF and TF-IDF vectors
# smoothed logarithmic idf
idfvector = np.squeeze(np.array(dfvector.todense()))
idfvector = 1 + np.log(((idfvector + 1) ** -1) * n_docs)

In [47]:
idfvector

array([6.89615437, 7.68461173, 6.89615437, ..., 7.50229017, 6.99146455,
       7.34813949])

In [48]:
# use the count statistics to compute the tf-idf matrix
tfidfmatrix = scipy.sparse.lil_matrix((n_docs, n_vocab))
# use the count statistics to compute the tf-idf matrix
for k in range(n_docs):
    # find nonzero term frequencies
    tempindices = np.nonzero(tfmatrix[k, :])[1]
    tfterm = np.squeeze(np.array(tfmatrix[k, tempindices].todense()))
    tfidfmatrix[k, tempindices] = tfterm * idfvector[tempindices]

In [49]:
tfidfmatrix.shape

(4000, 12087)

##### 6.1d

In [50]:
# Compute 10 factors from LSA
n_low_dimensions = 10
Uleft, D, UrightT = scipy.sparse.linalg.svds(tfidfmatrix.todense(), k=n_low_dimensions)

In [51]:
# Examine the singular values
print(D)

[4.94213356 4.97446492 5.07663717 5.11207295 5.22913741 5.62844437
 5.93986337 6.73541547 7.36648552 7.78843854]


In [52]:
UrightT.shape 

(10, 12087)

In [53]:
print(UrightT) # 10 factors

[[-7.48652045e-05 -4.38742249e-05 -7.48652045e-05 ... -7.88168035e-05
   5.36025794e-04 -1.55600569e-04]
 [-5.34351494e-04 -3.38914712e-04 -5.34351494e-04 ... -7.30798271e-04
   3.80269118e-03 -8.52743341e-04]
 [ 8.93116267e-05  2.60207810e-05  8.93116267e-05 ...  7.54251556e-05
   2.12102348e-04  9.34886466e-05]
 ...
 [-7.34234699e-05 -3.95872799e-05 -7.34234699e-05 ... -8.23423235e-05
  -1.12980412e-04 -1.28019655e-04]
 [-3.63891590e-06 -1.67643667e-06 -3.63891590e-06 ... -2.45962949e-06
  -4.46098184e-06 -1.99512910e-06]
 [-5.13664759e-05 -1.81954707e-05 -5.13664759e-05 ... -6.57241349e-05
  -6.79921477e-05 -2.73291720e-05]]


In [54]:
# Examine a factor (here the one with largest singular value)
print(UrightT[9, :])

[-5.13664759e-05 -1.81954707e-05 -5.13664759e-05 ... -6.57241349e-05
 -6.79921477e-05 -2.73291720e-05]


##### 6.1e

In [55]:
# 10 words with largest absolute weights in each factor
for i in range(UrightT.shape[0]):
    print('10 words with largest absolute weights in factor ' + str(i+1))
    print('-------------------------------------------------------------')
    topweights_indices=np.argsort(-1*np.abs(UrightT[i,:]))
    print(remaining_vocabulary[topweights_indices[0:10]])
    print('\n')

10 words with largest absolute weights in factor 1
-------------------------------------------------------------
['osf.org' 'research.nj.nec.com' 'tin' 'x-newsreader' '1.1' 'group'
 'version' 'appreciate' 'pl8' 'honda']


10 words with largest absolute weights in factor 2
-------------------------------------------------------------
['tin' 'x-newsreader' '1.1' 'version' 'pl8' 'group' 'appreciate' 'jewish'
 'honda' 'pl4']


10 words with largest absolute weights in factor 3
-------------------------------------------------------------
['altima' 'generic' 'jewish' 'kingman' 'appreciate' 'stanza' 'nissan'
 'group' 'paperwork' 'nameplate']


10 words with largest absolute weights in factor 4
-------------------------------------------------------------
['jewish' 'kingman' 'tin' 'x-newsreader' '1.1' 'lafibm.lafayette.edu'
 'vb30' 'version' 'acad.drake.edu' 'altima']


10 words with largest absolute weights in factor 5
-------------------------------------------------------------
['appreciat

**The words in the factors seem related to the newsgroups**
- **honda, nissan and license seem related to rec.motorcycles newsgroup**
- **honda, stanza, nissan, altima, viper and lincense seem related to rec.autos newsgroup**
- **career, captain seem related to rec.sport.hockey newsgroup**
- **career, baseman, cub, yankee and captain seem related to rec.sport.baseball newsgroup**

##### 6.1f

In [56]:
# Compute 15 factors from LSA
n_low_dimensions = 15
Uleft, D, UrightT = scipy.sparse.linalg.svds(tfidfmatrix.todense(), k=n_low_dimensions)

In [57]:
# Examine the singular values
print(D) 

[4.47621528 4.61852057 4.71696298 4.75945967 4.78296889 4.94213356
 4.97446492 5.07663717 5.11207295 5.22913741 5.62844437 5.93986337
 6.73541547 7.36648552 7.78843854]


In [58]:
UrightT.shape # 15 factors

(15, 12087)

In [59]:
# 10 words with largest absolute weights in each factor
for i in range(UrightT.shape[0]):
    print('10 words with largest absolute weights in factor ' + str(i+1))
    print('-------------------------------------------------------------')
    topweights_indices=np.argsort(-1*np.abs(UrightT[i,:]))
    print(remaining_vocabulary[topweights_indices[0:10]])
    print('\n')

10 words with largest absolute weights in factor 1
-------------------------------------------------------------
['blah' 'yankee' 'eliot' 'stats' 'thank' 'arocha' 'group' 'jeffries'
 'status' 'dolven']


10 words with largest absolute weights in factor 2
-------------------------------------------------------------
['honda' 'subscribe' 'tour' 'group' 'lake' 'blah' 'forget' 'final'
 'advantage' 'stats']


10 words with largest absolute weights in factor 3
-------------------------------------------------------------
['group' 'lake' 'final' 'maine' 'advantage' '5-4' 'blah' 'lssu' 'terry'
 'appreciate']


10 words with largest absolute weights in factor 4
-------------------------------------------------------------
['advantage' 'lake' 'final' 'group' 'maine' '5-4' 'lssu' 'blah' 'terry'
 'appreciate']


10 words with largest absolute weights in factor 5
-------------------------------------------------------------
['group' 'advantage' 'lake' 'final' 'blah' 'honda' 'maine' 'subscribe'
 '5-

- **The last 10 factors produce the same words as the 10 factors in (e)**
- **The new 5 factors seem less meaningful as they have fewer words that seem related to the newsgroups**


##### Exercise 6.2

##### 6.2a

In [130]:
# create count vectors
n_docs = len(mycrawled_prunedtexts) # 4000=total number of documents
n_vocab = len(remaining_vocabulary) # 12087
# matrix of term frequencies
tfmatrix = scipy.sparse.lil_matrix((n_docs, n_vocab)) # 4000x12087
# row vector of document frequencies
dfvector = scipy.sparse.lil_matrix((1, n_vocab)) #1x12087

In [131]:
n_docs, n_vocab

(4000, 12087)

In [132]:
tfmatrix.shape, dfvector.shape

((4000, 12087), (1, 12087))

In [133]:
# loop over the documents
for k in range(n_docs):
    # row vector of which words occur in this document
    temp_dfvector = scipy.sparse.lil_matrix((1, n_vocab)) # #1x12087
    # we loop over the words in each document
    for l in range(len(mycrawled_prunedtexts[k])):
        # add current word to term-frequency count and document count
        currentword = myindices_in_prunedvocabulary[k][l]
        tfmatrix[k, currentword] = tfmatrix[k, currentword] + 1
        #tfmatrix[k, currentword] = tfmatrix[k, currentword]/len(mycrawled_prunedtexts[k])
        temp_dfvector[0, currentword] = 1
    # add which words occur in this document to overall document counts
    
    dfvector = dfvector + temp_dfvector

In [134]:
tfmatrix.shape

(4000, 12087)

##### 6.2b

In [135]:
def plsa(document_to_word_matrix, n_topics, n_iterations):
    n_docs=np.shape(document_to_word_matrix)[0] # Number of documents 
    n_vocab=np.shape(document_to_word_matrix)[1] # Number of vocabulary words
    theta = scipy.stats.uniform.rvs(size=(n_vocab, n_topics)) # Prob of words per topic: random init
    theta = theta/np.matlib.repmat(np.sum(theta,axis=0),n_vocab,1)
    psi = scipy.stats.uniform.rvs(size=(n_topics,n_docs)) # Probs topics per document: random init
    psi = psi/np.matlib.repmat(np.sum(psi,axis=0),n_topics,1)
    n_words_in_docs = np.squeeze(np.array(np.sum(document_to_word_matrix,axis=1))) # Numbers of words in documents: 
                                                                                   # computed once
    n_totalwords = np.sum(n_words_in_docs) # Total number of words: computed once
    pi = n_words_in_docs/n_totalwords # Document probs: computed once
    for myiter in range(n_iterations): # Perform Expectation-Maximization iterations
        # ===Perform E-step====
        doc_word_to_topics = [] # Compute theta_{v|t}psi_{t|d}/sum_t' theta_{v|t'}psi_{t'|d}
        doc_word_to_topic_sum = np.zeros((n_docs,n_vocab))
        for t in range(n_topics):
            doc_word_to_topict = np.matlib.repmat(theta[:,t], n_docs,1) * np.matlib.repmat(psi[t,:],n_vocab,1).T
            myepsilon=1e-14 # Add a positive number to avoid divisions by zero
            doc_word_to_topict += myepsilon
            doc_word_to_topics.append(doc_word_to_topict)
            doc_word_to_topic_sum += doc_word_to_topict
    for t in range(n_topics):
        doc_word_to_topics[t] /= doc_word_to_topic_sum
    # =======Perform M-step=======
    # Add a small number to word counts to avoid divisions by zero
    for t in range(n_topics): # Compute document-to-topic probabilities.
        psi[t,:] = np.squeeze(np.array(np.sum(np.multiply(
            document_to_word_matrix + myepsilon, doc_word_to_topics[t]),axis=1)))
    psi /= np.matlib.repmat(np.sum(psi,axis=0),n_topics,1)
    for t in range(n_topics): # Compute topic-to-word probabilities
        theta[:,t]= np.squeeze(np.array(np.sum(np.multiply(
            document_to_word_matrix,doc_word_to_topics[t]),axis=0).T))
    theta /= np.matlib.repmat(np.sum(theta,axis=0),n_vocab,1)
    return(pi,psi,theta)

In [136]:
# Run PLSA
n_topics = 10
n_iterations = 200
pi, psi, theta = plsa(tfmatrix.todense(), n_topics, n_iterations)

In [137]:
# Examine the factor probabilities p(t) = sum_d p(t|d) p(d)
print(np.sum(psi * np.matlib.repmat(pi, n_topics,1), axis=1))

[0.1022757  0.10237872 0.10360222 0.09754084 0.09814357 0.09818026
 0.09904054 0.10307408 0.09505071 0.10071336]


In [138]:
# Examine a factor (here the one with largest probability, factor 7)
print(theta[:, 7])

[4.54652329e-05 1.90239634e-05 4.62941613e-05 ... 1.16889231e-05
 6.51174676e-05 2.45828384e-05]


In [139]:
theta[:, 7].shape

(12087,)

In [141]:
psi[7,:].shape

(4000,)

In [142]:
np.matlib.repmat(pi, n_topics,1).shape

(10, 4000)

In [143]:
theta.shape

(12087, 10)

##### 6.2c

In [145]:
# 10 words with largest absolute weights in each factor
for i in range(theta.shape[1]):
    print('10 words with largest absolute weights in factor ' + str(i+1))
    print('-------------------------------------------------------------')
    topweights_indices = np.argsort(-1 * np.abs(theta[:, i]))
    print(remaining_vocabulary[topweights_indices[0:10]])
    print('\n')

10 words with largest absolute weights in factor 1
-------------------------------------------------------------
['ago' 'cost' 'final' 'det' 'hold' 'york' 'phone' 'beat' 'e-mail'
 'mention']


10 words with largest absolute weights in factor 2
-------------------------------------------------------------
['buffalo' 'york' 'job' 'allow' 'mind' 'bos' 'mention' 'pull' 'wait'
 'final']


10 words with largest absolute weights in factor 3
-------------------------------------------------------------
['later' 'group' 'forget' 'saw' 'mention' 'hope' 'month' 'cal'
 'disclaimer' 'late']


10 words with largest absolute weights in factor 4
-------------------------------------------------------------
['young' 'ice' 'almost' 'past' 'cost' 'local' 'insurance' 'bill' 'mind'
 'man']


10 words with largest absolute weights in factor 5
-------------------------------------------------------------
['large' 'beat' 'compare' 'friend' 'order' 'later' 'ago' 'forget' 'set'
 'tie']


10 words with largest a

**Some words in the factors seem related to the newsgroups**
- **final, beat, york, pull, ice, performance, penalty seem related to rec.sport.baseball and rec.sport.hockey newsgroups**
- **insurance, vehicle, ford, honda, rider seem related to rec.autos and rec.motorcycles newsgroups**

##### 6.2d

In [146]:
psi.shape

(10, 4000)

In [208]:
# 10 words with largest absolute weights in each factor
for i in range(psi.shape[0]):
    topweights_indices = np.argsort(-1 * np.abs(psi[i, :]))
    print('Document index with highest factor probability: ' + str(topweights_indices[0] + 1))
    print('-----------------------------------------------------------------------')
    print('First 100 words of factor ' + str(i+1) + ' of the document with highest probability')
    print('-----------------------------------------------------------------------')
    print(remaining_vocabulary[topweights_indices[0:100]])
    print('\n')

Document index with highest factor probability: 516
-----------------------------------------------------------------------
First 100 words of factor 1 of the document with highest probability
-----------------------------------------------------------------------
['0.565' '.20' '412' 'convenient' '*chain*' '6-4' 'ciccerelli' '02-00'
 'closer' "'62" '3com' 'cbs' '415' '1960' '1qkkodinn5f5' '0.62' 'buf/sd'
 '571' 'advice' 'continually' 'ash' 'author' '0.00' '-r-i-d-e-r'
 'compulsive' 'citadel' 'charles.a.rogers' 'blvd' '3732' '5-1' '175'
 '.signature' '.294' '984' 'buzz' '139' 'cod' '240+' '*young*' 'canucks'
 'barry' 'baumgartner' '20-' 'cabletron' '__/' 'bigot' '300' 'aptitude'
 'core' 'battle' '0.879' 'adolf' 'british' '227' '.405' '0.74' 'accident'
 '11-12' '36.4' '10.00' '0111' 'bielecki' 'advance' '.427' 'acquisition'
 'berube' '0.26' 'assure' '0.19' '1000km' '.274' 'category' '0.889'
 'cleaner-burning' 'brock' '1040' '4-6' 'bullet' 'bowman' '85.0' '328'
 '0.43' 'blumstein' 'bein'

***6.2e***

**Many words in each factor seem not to be meaningful and related to the newsgroups.**

#### Exercise 6.3

##### 6.3f

In [230]:
import gensim
gensim_docs = mycrawled_prunedtexts
gensim_dictionary = gensim.corpora.Dictionary(gensim_docs)
# Create the document-term vectors
gensim_docvectors = [gensim_dictionary.doc2bow(doc) for doc in gensim_docs]

##### 6.3g

In [241]:
# Run the LDA optimization for 10 topics
numtopics=10
randomseed=124574527
numiters=10000
ninits=10
gensim_ldamodel = gensim.models.ldamodel.LdaModel(gensim_docvectors, id2word=gensim_dictionary,
                                                 num_topics=numtopics, iterations=numiters, random_state=randomseed)

##### 6.3h

In [242]:
gensim_ldamodel.print_topics(num_topics=numtopics, num_words=10)

[(0,
  '0.007*"captain" + 0.006*"gant" + 0.005*"philadelphia" + 0.005*"hirschbeck" + 0.005*"puck" + 0.004*"devil" + 0.003*"umpire" + 0.003*"hawk" + 0.003*"suck" + 0.003*"strike"'),
 (1,
  '0.006*"championship" + 0.004*"maynard" + 0.004*"arena" + 0.004*"+/-" + 0.003*"draft" + 0.003*"station" + 0.003*"winner" + 0.003*"halifax" + 0.003*"pen" + 0.003*"paul"'),
 (2,
  '0.005*"puck" + 0.005*"buffalo" + 0.003*"total" + 0.003*"van" + 0.003*"career" + 0.003*"ott" + 0.003*"bond" + 0.002*"rochester" + 0.002*"compare" + 0.002*"morris"'),
 (3,
  '0.006*"det" + 0.005*"tor" + 0.005*"pit" + 0.005*"bos" + 0.005*"montreal" + 0.004*"cal" + 0.004*"min" + 0.003*"stl" + 0.003*"minnesota" + 0.003*"lock"'),
 (4,
  '0.008*"penalty" + 0.004*"conference" + 0.004*"quebec" + 0.004*"hartford" + 0.003*"montreal" + 0.003*"___" + 0.003*"ottawa" + 0.002*"sanderson" + 0.002*"tie" + 0.002*"louis"'),
 (5,
  '0.007*"stanley" + 0.004*"final" + 0.004*"joseph" + 0.003*"pen" + 0.003*"european" + 0.003*"minute" + 0.003*"jet" + 

- ***The top words in each topic seem related to sports - baseball and hockey***

##### 6.3i

In [243]:
# Get topic content: term-topic probabilities
gensim_termtopicprobabilities = gensim_ldamodel.get_topics()
# Get topic prevalences per document, and overall topic prevalences
# (expected amount of documents per topic)
overallstrengths = np.zeros((numtopics, 1))
documentstrengths = np.zeros((len(gensim_docvectors), numtopics)) # 4000x10
for k in range(len(gensim_docvectors)):
    topicstrengths = gensim_ldamodel.get_document_topics(gensim_docvectors[k], minimum_probability=0)
    for m in range(len(topicstrengths)):
        documentstrengths[k][topicstrengths[m][0]] = topicstrengths[m][1]
        overallstrengths[topicstrengths[m][0]] = overallstrengths[topicstrengths[m][0]] + topicstrengths[m][1]

In [244]:
for i in range(numtopics):
    word_list = []
    topweights_indices = np.argsort(-1 * np.abs(documentstrengths[:, i]))
    print('Document with highest topic probability: ' + str(topweights_indices[0] + 1))
    print('-----------------------------------------------------------------------')
    print('First 100 words of topic ' + str(i+1) + ' of the document with highest topic probability')
    print('-----------------------------------------------------------------------')
    words_with_probabilities = gensim_ldamodel.show_topic(i, topn=100)
    for word_with_probability in words_with_probabilities:
        word_list.append(word_with_probability)
    print(word_list)
    print('\n')

Document with highest topic probability: 1956
-----------------------------------------------------------------------
First 100 words of topic 1 of the document with highest topic probability
-----------------------------------------------------------------------
[('captain', 0.007223059), ('gant', 0.0064131706), ('philadelphia', 0.0046833158), ('hirschbeck', 0.0046017454), ('puck', 0.0045988844), ('devil', 0.0038172475), ('umpire', 0.003268002), ('hawk', 0.0031789003), ('suck', 0.003171802), ('strike', 0.0026171238), ('coverage', 0.002407789), ('hitter', 0.0023553455), ('helmet', 0.0023364287), ('order', 0.0023089394), ('stand', 0.0022562516), ('coach', 0.0022156944), ('left', 0.0021053758), ('slot', 0.002073364), ('ice', 0.0020159716), ('box', 0.0020067089), ('saw', 0.0019916652), ('tournament', 0.0019056029), ('insurance', 0.001871022), ("'91", 0.001848288), ('champ', 0.0018168534), ('ray', 0.0018124336), ('skate', 0.0018108978), ('type', 0.0018077022), ('adobe.com', 0.0017357153), 

***6.3j***

***The contents of individual topics seem meaningful and related to the newsgroups***

##### 6.3k

In [235]:
# Run the LDA optimization for 15 topics
numtopics=15
gensim_ldamodel = gensim.models.ldamodel.LdaModel(gensim_docvectors, id2word=gensim_dictionary,
                                                 num_topics=numtopics, iterations=numiters, random_state=randomseed)

In [236]:
gensim_ldamodel.print_topics(num_topics=numtopics, num_words=10)

[(0,
  '0.010*"captain" + 0.007*"hawk" + 0.005*"philadelphia" + 0.005*"suck" + 0.005*"0.00" + 0.004*"coach" + 0.003*"swedish" + 0.003*"cap" + 0.003*"former" + 0.003*"stand"'),
 (1,
  '0.008*"maynard" + 0.006*"championship" + 0.006*"scoring" + 0.005*"winner" + 0.005*"defense" + 0.005*"2nd" + 0.005*"coach" + 0.005*"draft" + 0.004*"yankee" + 0.004*"station"'),
 (2,
  '0.005*"buffalo" + 0.003*"rochester" + 0.003*"draft" + 0.003*"club" + 0.003*"cape" + 0.003*"pit" + 0.003*"inning" + 0.003*"total" + 0.003*"rule" + 0.003*"1992-93"'),
 (3,
  '0.006*"montreal" + 0.005*"roy" + 0.005*"1-0" + 0.004*"pit" + 0.004*"minnesota" + 0.004*"pen" + 0.004*"lock" + 0.004*"mattingly" + 0.003*"michael" + 0.003*"norway"'),
 (4,
  '0.006*"hartford" + 0.004*"ottawa" + 0.004*"___" + 0.003*"morris" + 0.002*"+/-" + 0.002*"nick" + 0.002*"canadian" + 0.002*"view" + 0.002*"rider" + 0.002*"mind"'),
 (5,
  '0.018*"penalty" + 0.005*"minute" + 0.004*"final" + 0.004*"=====" + 0.004*"joseph" + 0.004*"delete" + 0.003*"hull" +

***The first 10 topics are not the same as the 10 topics in (h)***
- ***There are only some words in the first 10 topics that appear in the 10 topics in (h). These words have different probabilities.***

In [237]:
# Get topic content: term-topic probabilities
gensim_termtopicprobabilities = gensim_ldamodel.get_topics()
# Get topic prevalences per document, and overall topic prevalences
# (expected amount of documents per topic)
overallstrengths = np.zeros((numtopics, 1))
documentstrengths = np.zeros((len(gensim_docvectors), numtopics)) # 4000x15
for k in range(len(gensim_docvectors)):
    topicstrengths = gensim_ldamodel.get_document_topics(gensim_docvectors[k], minimum_probability=0)
    for m in range(len(topicstrengths)):
        documentstrengths[k][topicstrengths[m][0]] = topicstrengths[m][1]
        overallstrengths[topicstrengths[m][0]] = overallstrengths[topicstrengths[m][0]] + topicstrengths[m][1]

In [238]:
documentstrengths.shape

(4000, 15)

In [240]:
for i in range(numtopics):
    word_list = []
    topweights_indices = np.argsort(-1 * np.abs(documentstrengths[:, i]))
    print('Document with highest topic probability: ' + str(topweights_indices[0] + 1))
    print('-----------------------------------------------------------------------')
    print('First 100 words of topic ' + str(i+1) + ' of the document with highest topic probability')
    print('-----------------------------------------------------------------------')
    words_with_probabilities = gensim_ldamodel.show_topic(i, topn=100)
    for word_with_probability in words_with_probabilities:
        word_list.append(word_with_probability)
    print(word_list)
    print('\n')

Document with highest topic probability: 1956
-----------------------------------------------------------------------
First 100 words of topic 1 of the document with highest topic probability
-----------------------------------------------------------------------
[('captain', 0.010319422), ('hawk', 0.0074584656), ('philadelphia', 0.005420894), ('suck', 0.004754035), ('0.00', 0.0045288955), ('coach', 0.0037435545), ('swedish', 0.0032953615), ('cap', 0.002971868), ('former', 0.0029717328), ('stand', 0.0029034365), ('face', 0.0028993487), ('champ', 0.002867784), ('devil', 0.0027952315), ('keith', 0.0027301796), ('hitter', 0.0027274876), ('pittsburg', 0.0024270937), ('green', 0.002407949), ('dean', 0.0024045585), ('hold', 0.0023849893), ('left', 0.0023147583), ('type', 0.0022901236), ('keller', 0.0022689837), ('tournament', 0.0022403714), ('ice', 0.002207734), ('insurance', 0.0021073793), ('kkeller', 0.0020703613), ('cookson', 0.0020533018), ("'91", 0.0020426563), ('minute', 0.0020410076),

***The contents of individual topics also seem meaningful and related to the newsgroups***
- **There are only some words in the first 10 topics that appear in the 10 topics in (i) but with different probabilities**

In [179]:
count = 0
for i in gensim_ldamodel[gensim_docvectors]:
    print('doc : ', count, i)
    count += 1

doc :  0 [(5, 0.98830456)]
doc :  1 [(9, 0.9895229)]
doc :  2 [(4, 0.9718132)]
doc :  3 [(0, 0.9249026)]
doc :  4 [(6, 0.5737663), (9, 0.36904272)]
doc :  5 [(5, 0.983315)]
doc :  6 [(5, 0.9653462)]
doc :  7 [(1, 0.2338177), (9, 0.7451031)]
doc :  8 [(0, 0.9762776)]
doc :  9 [(0, 0.11379922), (5, 0.8619305)]
doc :  10 [(6, 0.95708895)]
doc :  11 [(0, 0.9947614)]
doc :  12 [(3, 0.95902306)]
doc :  13 [(7, 0.93069583)]
doc :  14 [(2, 0.97349083)]
doc :  15 [(9, 0.98765194)]
doc :  16 [(9, 0.98233795)]
doc :  17 [(6, 0.98160684)]
doc :  18 [(0, 0.1173234), (3, 0.5384582), (6, 0.33018884)]
doc :  19 [(7, 0.07466916), (9, 0.91280365)]
doc :  20 [(3, 0.66835415), (4, 0.316235)]
doc :  21 [(9, 0.9718486)]
doc :  22 [(1, 0.5019001), (8, 0.4829784)]
doc :  23 [(0, 0.016679354), (1, 0.016684098), (2, 0.016678859), (3, 0.016677998), (4, 0.016680801), (5, 0.84988385), (6, 0.016678777), (7, 0.016679991), (8, 0.016678344), (9, 0.016677912)]
doc :  24 [(7, 0.7055096), (8, 0.2596766)]
doc :  25 [(7, 0

doc :  182 [(2, 0.25276855), (6, 0.72902906)]
doc :  183 [(0, 0.98040926)]
doc :  184 [(2, 0.28998083), (9, 0.69904417)]
doc :  185 [(0, 0.13705005), (2, 0.83089864)]
doc :  186 [(2, 0.26850635), (5, 0.058957055), (6, 0.65730363)]
doc :  187 [(2, 0.80422246), (3, 0.17836761)]
doc :  188 [(3, 0.100342065), (5, 0.88608426)]
doc :  189 [(1, 0.9399292)]
doc :  190 [(0, 0.93560284)]
doc :  191 [(3, 0.9355174)]
doc :  192 [(2, 0.94703543)]
doc :  193 [(0, 0.014292673), (1, 0.8713676), (2, 0.014292056), (3, 0.014292031), (4, 0.014291986), (5, 0.014292462), (6, 0.014291981), (7, 0.0142935915), (8, 0.014292867), (9, 0.014292775)]
doc :  194 [(4, 0.9803986)]
doc :  195 [(2, 0.9590734)]
doc :  196 [(6, 0.95905554)]
doc :  197 [(2, 0.20908853), (4, 0.77858496)]
doc :  198 [(1, 0.28186873), (2, 0.09672869), (4, 0.6121776)]
doc :  199 [(2, 0.31310168), (8, 0.6835747)]
doc :  200 [(2, 0.99180895)]
doc :  201 [(0, 0.54165894), (2, 0.426307)]
doc :  202 [(9, 0.99020594)]
doc :  203 [(0, 0.21022977), (6

doc :  349 [(6, 0.9718483)]
doc :  350 [(9, 0.9436689)]
doc :  351 [(5, 0.218711), (9, 0.7492545)]
doc :  352 [(0, 0.16074064), (5, 0.82872236)]
doc :  353 [(0, 0.6397436), (9, 0.33916166)]
doc :  354 [(0, 0.95707875)]
doc :  355 [(4, 0.46284983), (7, 0.5266084)]
doc :  356 [(0, 0.984187)]
doc :  357 [(5, 0.98952335)]
doc :  358 [(1, 0.9470198)]
doc :  359 [(1, 0.9399547)]
doc :  360 [(9, 0.98121375)]
doc :  361 [(0, 0.050015714), (1, 0.050009314), (2, 0.05001022), (3, 0.050014608), (4, 0.05002104), (5, 0.54987156), (6, 0.050017428), (7, 0.050013617), (8, 0.05001325), (9, 0.05001327)]
doc :  362 [(5, 0.9624544)]
doc :  363 [(2, 0.9608015)]
doc :  364 [(1, 0.12546076), (9, 0.8554644)]
doc :  365 [(0, 0.99568677)]
doc :  366 [(6, 0.99030775)]
doc :  367 [(6, 0.58138037), (8, 0.41008878)]
doc :  368 [(3, 0.98546344)]
doc :  369 [(8, 0.9470167)]
doc :  370 [(3, 0.4332526), (5, 0.5196115)]
doc :  371 [(6, 0.99785984)]
doc :  372 [(1, 0.9762974)]
doc :  373 [(1, 0.07720679), (2, 0.9022526)]


doc :  565 [(1, 0.52705866), (7, 0.46002153)]
doc :  566 [(4, 0.9749758)]
doc :  567 [(0, 0.95905226)]
doc :  568 [(4, 0.9040007), (5, 0.07895128)]
doc :  569 [(0, 0.34578145), (1, 0.12988442), (3, 0.15442888), (4, 0.35364175)]
doc :  570 [(9, 0.98781246)]
doc :  571 [(0, 0.6408236), (6, 0.33296478), (8, 0.021628242)]
doc :  572 [(5, 0.123455435), (7, 0.8569998)]
doc :  573 [(4, 0.5823448), (8, 0.1642585), (9, 0.2183644)]
doc :  574 [(0, 0.17863294), (2, 0.21066716), (7, 0.25785607), (9, 0.3305805)]
doc :  575 [(4, 0.94368917)]
doc :  576 [(6, 0.3451248), (7, 0.6388581)]
doc :  577 [(7, 0.9902089)]
doc :  578 [(6, 0.8934515), (9, 0.09913154)]
doc :  579 [(0, 0.17368911), (9, 0.8120078)]
doc :  580 [(4, 0.9921639)]
doc :  581 [(1, 0.98766387)]
doc :  582 [(6, 0.98712754)]
doc :  583 [(1, 0.7707895), (5, 0.068515345), (8, 0.15193468)]
doc :  584 [(1, 0.19699293), (2, 0.755909)]
doc :  585 [(4, 0.9608199)]
doc :  586 [(1, 0.19900398), (5, 0.060854096), (9, 0.71675897)]
doc :  587 [(3, 0.9

doc :  771 [(3, 0.19971596), (4, 0.09769519), (6, 0.24029335), (7, 0.45459378)]
doc :  772 [(4, 0.7035156), (5, 0.2905486)]
doc :  773 [(5, 0.9871286)]
doc :  774 [(9, 0.98653334)]
doc :  775 [(5, 0.4788906), (7, 0.48773548)]
doc :  776 [(0, 0.7011972), (8, 0.2679898)]
doc :  777 [(8, 0.04574805), (9, 0.9454498)]
doc :  778 [(5, 0.9624583)]
doc :  779 [(7, 0.9922327)]
doc :  780 [(3, 0.08087746), (6, 0.90243334)]
doc :  781 [(0, 0.371109), (4, 0.60386264)]
doc :  782 [(0, 0.6735145), (6, 0.30645502)]
doc :  783 [(4, 0.34311053), (7, 0.6373527)]
doc :  784 [(3, 0.06414598), (6, 0.92293286)]
doc :  785 [(2, 0.6346859), (4, 0.35476944)]
doc :  786 [(7, 0.17998214), (8, 0.8111097)]
doc :  787 [(6, 0.99296135)]
doc :  788 [(5, 0.9830003)]
doc :  789 [(2, 0.64241487), (8, 0.3353324)]
doc :  790 [(3, 0.9178289), (7, 0.072856575)]
doc :  791 [(0, 0.28143212), (8, 0.71094114)]
doc :  792 [(1, 0.1325604), (5, 0.13500132), (9, 0.7054335)]
doc :  793 [(9, 0.9876578)]
doc :  794 [(5, 0.38700348), (

doc :  955 [(5, 0.97993773)]
doc :  956 [(0, 0.6606222), (2, 0.32030606)]
doc :  957 [(0, 0.8372384), (8, 0.15193327)]
doc :  958 [(0, 0.9639546)]
doc :  959 [(0, 0.34562787), (4, 0.3125041), (7, 0.32838625)]
doc :  960 [(9, 0.9922334)]
doc :  961 [(5, 0.121656105), (9, 0.8115853)]
doc :  962 [(4, 0.4558379), (5, 0.16648546), (6, 0.043368876), (7, 0.33178204)]
doc :  963 [(9, 0.98300064)]
doc :  964 [(9, 0.97629064)]
doc :  965 [(0, 0.011127578), (1, 0.89984834), (2, 0.011127365), (3, 0.011127273), (4, 0.011129544), (5, 0.011128137), (6, 0.0111276675), (7, 0.0111281), (8, 0.01112794), (9, 0.011128049)]
doc :  966 [(0, 0.11820732), (2, 0.56334686), (6, 0.061766386), (8, 0.24844208)]
doc :  967 [(1, 0.8203549), (4, 0.16651602)]
doc :  968 [(0, 0.012526847), (1, 0.01252717), (2, 0.0125287315), (3, 0.012526577), (4, 0.58770066), (5, 0.012529987), (6, 0.31207404), (7, 0.012527341), (8, 0.012529457), (9, 0.012529236)]
doc :  969 [(4, 0.9774721)]
doc :  970 [(0, 0.98082334)]
doc :  971 [(2, 0

doc :  1146 [(7, 0.9756551)]
doc :  1147 [(8, 0.9931222)]
doc :  1148 [(3, 0.96890116)]
doc :  1149 [(9, 0.9756597)]
doc :  1150 [(9, 0.9780161)]
doc :  1151 [(7, 0.9867525)]
doc :  1152 [(4, 0.9854698)]
doc :  1153 [(0, 0.7898167), (8, 0.17379855)]
doc :  1154 [(1, 0.36287686), (6, 0.61546075)]
doc :  1155 [(4, 0.12649697), (7, 0.8676974)]
doc :  1156 [(1, 0.652477), (9, 0.31892094)]
doc :  1157 [(3, 0.9411058), (6, 0.05355086)]
doc :  1158 [(3, 0.39962038), (9, 0.5430884)]
doc :  1159 [(9, 0.99570906)]
doc :  1160 [(6, 0.9437017)]
doc :  1161 [(6, 0.8409655), (9, 0.12700044)]
doc :  1162 [(2, 0.25893876), (4, 0.72747946)]
doc :  1163 [(6, 0.41817576), (9, 0.5469967)]
doc :  1164 [(0, 0.25835627), (6, 0.12937687), (9, 0.60278875)]
doc :  1165 [(0, 0.16825922), (2, 0.8074614)]
doc :  1166 [(4, 0.98198134)]
doc :  1167 [(4, 0.3340781), (5, 0.6086557)]
doc :  1168 [(0, 0.59026194), (5, 0.33685738)]
doc :  1169 [(4, 0.9549475)]
doc :  1170 [(6, 0.1489594), (7, 0.7893298)]
doc :  1171 [(3,

doc :  1359 [(6, 0.94700986)]
doc :  1360 [(0, 0.636279), (7, 0.35440984)]
doc :  1361 [(3, 0.9965105)]
doc :  1362 [(5, 0.8131305), (6, 0.17435504)]
doc :  1363 [(0, 0.020060303), (1, 0.020088186), (2, 0.020058746), (3, 0.020059573), (4, 0.020060176), (5, 0.020059856), (6, 0.020060306), (7, 0.81943303), (8, 0.020059237), (9, 0.020060617)]
doc :  1364 [(0, 0.6973367), (7, 0.29366538)]
doc :  1365 [(0, 0.30051926), (6, 0.6674568)]
doc :  1366 [(0, 0.96997666)]
doc :  1367 [(0, 0.13242093), (3, 0.83275694)]
doc :  1368 [(0, 0.9785547)]
doc :  1369 [(2, 0.96247137)]
doc :  1370 [(9, 0.9902058)]
doc :  1371 [(8, 0.95709)]
doc :  1372 [(9, 0.97802144)]
doc :  1373 [(0, 0.98713356)]
doc :  1374 [(2, 0.3646209), (8, 0.21226898), (9, 0.40849987)]
doc :  1375 [(4, 0.94366515)]
doc :  1376 [(4, 0.649241), (9, 0.3378426)]
doc :  1377 [(0, 0.43975678), (1, 0.5268844)]
doc :  1378 [(3, 0.5699012), (7, 0.14145546), (9, 0.2806726)]
doc :  1379 [(8, 0.9699695)]
doc :  1380 [(2, 0.24599999), (9, 0.7396

doc :  1582 [(0, 0.014325689), (1, 0.01432614), (2, 0.014332023), (3, 0.1562737), (4, 0.014326335), (5, 0.014325384), (6, 0.014336423), (7, 0.014328173), (8, 0.7290979), (9, 0.014328189)]
doc :  1583 [(0, 0.5285239), (2, 0.40984818)]
doc :  1584 [(7, 0.9808394)]
doc :  1585 [(2, 0.98765785)]
doc :  1586 [(1, 0.11999458), (2, 0.38471064), (8, 0.4854187)]
doc :  1587 [(5, 0.94992846)]
doc :  1588 [(4, 0.8014064), (6, 0.18762723)]
doc :  1589 [(2, 0.7302258), (7, 0.26383936)]
doc :  1590 [(0, 0.6039445), (2, 0.30207413), (8, 0.069817655)]
doc :  1591 [(8, 0.9964676)]
doc :  1592 [(7, 0.9780127)]
doc :  1593 [(2, 0.195163), (4, 0.79682535)]
doc :  1594 [(1, 0.22157925), (3, 0.74876547)]
doc :  1595 [(1, 0.94698125)]
doc :  1596 [(3, 0.55614007), (5, 0.17827019), (8, 0.23055187)]
doc :  1597 [(8, 0.9756542)]
doc :  1598 [(4, 0.96664053)]
doc :  1599 [(2, 0.930688)]
doc :  1600 [(6, 0.30740398), (8, 0.68482345)]
doc :  1601 [(1, 0.9608471)]
doc :  1602 [(0, 0.10794605), (3, 0.69138104), (6, 

doc :  1774 [(0, 0.8916117), (6, 0.096947655)]
doc :  1775 [(2, 0.64420503), (7, 0.35222)]
doc :  1776 [(9, 0.9499278)]
doc :  1777 [(0, 0.72888064), (7, 0.26222247)]
doc :  1778 [(1, 0.9913404)]
doc :  1779 [(3, 0.21614301), (7, 0.7826964)]
doc :  1780 [(8, 0.9762918)]
doc :  1781 [(3, 0.22822316), (7, 0.7706482)]
doc :  1782 [(3, 0.9799805)]
doc :  1783 [(1, 0.5686344), (3, 0.20619443), (5, 0.1933136)]
doc :  1784 [(5, 0.9571177)]
doc :  1785 [(0, 0.012506608), (1, 0.012505088), (2, 0.012510728), (3, 0.88743585), (4, 0.012506394), (5, 0.012507169), (6, 0.0125084175), (7, 0.012506294), (8, 0.012506701), (9, 0.012506759)]
doc :  1786 [(5, 0.989015)]
doc :  1787 [(2, 0.77917814), (3, 0.21807595)]
doc :  1788 [(5, 0.98234653)]
doc :  1789 [(2, 0.9718477)]
doc :  1790 [(5, 0.17712231), (9, 0.8017842)]
doc :  1791 [(0, 0.1), (1, 0.1), (2, 0.1), (3, 0.1), (4, 0.1), (5, 0.1), (6, 0.1), (7, 0.1), (8, 0.1), (9, 0.1)]
doc :  1792 [(0, 0.06912232), (4, 0.9265485)]
doc :  1793 [(7, 0.99150354)]
d

doc :  1980 [(3, 0.9942232)]
doc :  1981 [(3, 0.5958075), (5, 0.06100997), (6, 0.33556157)]
doc :  1982 [(9, 0.99165964)]
doc :  1983 [(7, 0.9639793)]
doc :  1984 [(0, 0.45692185), (7, 0.4815219), (8, 0.054479565)]
doc :  1985 [(0, 0.08020259), (3, 0.13943507), (4, 0.77355784)]
doc :  1986 [(9, 0.99158406)]
doc :  1987 [(5, 0.87973595), (9, 0.10358609)]
doc :  1988 [(2, 0.9857057)]
doc :  1989 [(1, 0.9699748)]
doc :  1990 [(3, 0.97748613)]
doc :  1991 [(5, 0.53277844), (8, 0.45733336)]
doc :  1992 [(0, 0.29985976), (9, 0.690377)]
doc :  1993 [(0, 0.6344037), (3, 0.315498)]
doc :  1994 [(1, 0.94996554)]
doc :  1995 [(4, 0.5797712), (8, 0.4151909)]
doc :  1996 [(0, 0.3356727), (6, 0.44638637), (7, 0.17899811)]
doc :  1997 [(6, 0.96534544)]
doc :  1998 [(2, 0.67892885), (4, 0.31817713)]
doc :  1999 [(3, 0.2513894), (5, 0.1341973), (6, 0.594383)]
doc :  2000 [(1, 0.519458), (7, 0.16593243), (8, 0.28766567)]
doc :  2001 [(0, 0.64914674), (1, 0.2624071), (2, 0.011054937), (3, 0.011056714), (

doc :  2109 [(0, 0.20689288), (2, 0.6678335), (8, 0.11275531)]
doc :  2110 [(1, 0.27797025), (2, 0.58964336), (5, 0.07051399), (9, 0.052237723)]
doc :  2111 [(0, 0.10589013), (4, 0.8784922)]
doc :  2112 [(0, 0.037172552), (2, 0.5896566), (4, 0.27518973), (7, 0.033474512), (8, 0.058544673)]
doc :  2113 [(8, 0.24039517), (9, 0.7442087)]
doc :  2114 [(1, 0.12413584), (2, 0.1820786), (6, 0.081644155), (9, 0.5971241)]
doc :  2115 [(4, 0.03060343), (5, 0.02867152), (6, 0.79441184), (9, 0.13611768)]
doc :  2116 [(2, 0.7146154), (4, 0.25875887)]
doc :  2117 [(5, 0.7978893), (6, 0.06201951), (8, 0.1241586)]
doc :  2118 [(2, 0.18295614), (3, 0.2078489), (5, 0.073029175), (8, 0.17635433), (9, 0.35297465)]
doc :  2119 [(2, 0.02968454), (5, 0.020444246), (6, 0.7297707), (7, 0.21186838)]
doc :  2120 [(2, 0.20713493), (5, 0.07557005), (8, 0.5229037), (9, 0.17673783)]
doc :  2121 [(0, 0.07923486), (1, 0.29739755), (3, 0.34267172), (5, 0.26914388)]
doc :  2122 [(2, 0.20717394), (5, 0.07555781), (8, 0.5

doc :  2246 [(0, 0.10506994), (3, 0.4616458), (5, 0.1582122), (8, 0.25685304)]
doc :  2247 [(2, 0.46779454), (3, 0.2089991), (4, 0.19002488), (9, 0.123947516)]
doc :  2248 [(2, 0.22546512), (3, 0.34001276), (6, 0.16862586), (7, 0.14452048), (8, 0.110264294)]
doc :  2249 [(0, 0.09249306), (2, 0.74123573), (5, 0.06670325), (7, 0.08928026)]
doc :  2250 [(0, 0.15943399), (3, 0.7262131), (4, 0.09912203)]
doc :  2251 [(0, 0.25592154), (3, 0.72868997)]
doc :  2252 [(4, 0.2609127), (6, 0.3272797), (8, 0.3858471)]
doc :  2253 [(0, 0.10198503), (3, 0.77637136), (5, 0.086606465)]
doc :  2254 [(4, 0.24843593), (5, 0.7352224)]
doc :  2255 [(3, 0.50451434), (6, 0.41658643)]
doc :  2256 [(0, 0.74555457), (2, 0.034841955), (3, 0.054399207), (7, 0.032774493), (8, 0.120542325)]
doc :  2257 [(0, 0.014297873), (1, 0.014297138), (2, 0.01429688), (3, 0.23336431), (4, 0.652251), (5, 0.014296793), (6, 0.014302613), (7, 0.014299835), (8, 0.01429696), (9, 0.014296552)]
doc :  2258 [(0, 0.39363188), (2, 0.157325

doc :  2358 [(0, 0.06278899), (3, 0.07279056), (4, 0.06629834), (5, 0.7179668), (8, 0.073854774)]
doc :  2359 [(1, 0.058063086), (2, 0.9286545)]
doc :  2360 [(2, 0.26709616), (3, 0.086605184), (4, 0.07804803), (5, 0.5627877)]
doc :  2361 [(0, 0.44096482), (8, 0.06711937), (9, 0.46529832)]
doc :  2362 [(0, 0.19078565), (4, 0.21614088), (8, 0.5671447)]
doc :  2363 [(0, 0.05515251), (1, 0.013637645), (2, 0.61088), (4, 0.21443683), (7, 0.014504217), (8, 0.08869117)]
doc :  2364 [(2, 0.13568066), (8, 0.05678627), (9, 0.7941044)]
doc :  2365 [(2, 0.44764078), (4, 0.44689906), (7, 0.08989676)]
doc :  2366 [(3, 0.18708846), (8, 0.21872145), (9, 0.5552483)]
doc :  2367 [(0, 0.35941383), (2, 0.17603025), (6, 0.22519864), (7, 0.026458636), (8, 0.20565143)]
doc :  2368 [(0, 0.26127356), (5, 0.06402882), (9, 0.6645466)]
doc :  2369 [(2, 0.37814432), (5, 0.11438401), (6, 0.48051056)]
doc :  2370 [(4, 0.67499804), (7, 0.30727324)]
doc :  2371 [(0, 0.2715016), (2, 0.6750894)]
doc :  2372 [(4, 0.638031

doc :  2467 [(1, 0.51474446), (2, 0.21470754), (4, 0.18131442), (8, 0.08026365)]
doc :  2468 [(2, 0.044634815), (6, 0.9455991)]
doc :  2469 [(0, 0.6872587), (1, 0.012156358), (2, 0.012154015), (3, 0.012155795), (4, 0.012154481), (5, 0.012153879), (6, 0.01215636), (7, 0.012154036), (8, 0.21550126), (9, 0.012155043)]
doc :  2470 [(4, 0.81891686), (5, 0.024584478), (7, 0.14689785)]
doc :  2471 [(0, 0.16817717), (2, 0.5098768), (3, 0.18411922), (4, 0.076955564), (6, 0.046150267)]
doc :  2472 [(1, 0.48970917), (4, 0.08333241), (5, 0.13651887), (8, 0.073498555), (9, 0.2053038)]
doc :  2473 [(1, 0.8687064), (9, 0.08420222)]
doc :  2474 [(0, 0.0835966), (1, 0.081900336), (6, 0.23736022), (8, 0.17925504), (9, 0.40320927)]
doc :  2475 [(0, 0.64639455), (1, 0.010007802), (2, 0.010006876), (3, 0.010007029), (4, 0.0100075435), (5, 0.27354777), (6, 0.010006396), (7, 0.010007129), (8, 0.010007857), (9, 0.010007097)]
doc :  2476 [(5, 0.5941915), (7, 0.34861314)]
doc :  2477 [(0, 0.02000236), (1, 0.059

doc :  2569 [(2, 0.116761684), (8, 0.24242684), (9, 0.623298)]
doc :  2570 [(0, 0.38050425), (9, 0.59367037)]
doc :  2571 [(2, 0.1915352), (4, 0.24290457), (9, 0.5506605)]
doc :  2572 [(0, 0.47384867), (2, 0.12039783), (3, 0.26683128), (7, 0.12938884)]
doc :  2573 [(0, 0.30193433), (9, 0.6509599)]
doc :  2574 [(2, 0.38959438), (4, 0.5755923)]
doc :  2575 [(3, 0.9451169)]
doc :  2576 [(9, 0.979063)]
doc :  2577 [(0, 0.17560504), (1, 0.010009569), (2, 0.010010501), (3, 0.0100118425), (4, 0.010007932), (5, 0.010008339), (6, 0.010008496), (7, 0.010006921), (8, 0.74432355), (9, 0.010007776)]
doc :  2578 [(3, 0.19852462), (6, 0.021883441), (9, 0.76883143)]
doc :  2579 [(2, 0.23900741), (3, 0.25449547), (5, 0.14067902), (6, 0.24460022), (8, 0.06663345), (9, 0.048745424)]
doc :  2580 [(2, 0.047658157), (4, 0.022600813), (5, 0.02684876), (8, 0.041867554), (9, 0.8522623)]
doc :  2581 [(2, 0.6394892), (4, 0.09095506), (5, 0.07411888), (7, 0.03592787), (8, 0.1555749)]
doc :  2582 [(9, 0.97749466)]

doc :  2700 [(0, 0.38385803), (3, 0.35489577), (7, 0.052099418), (8, 0.19338258)]
doc :  2701 [(1, 0.2900883), (6, 0.26285806), (7, 0.040208526), (8, 0.39407063)]
doc :  2702 [(2, 0.17243572), (3, 0.7734105), (4, 0.040663734)]
doc :  2703 [(2, 0.8968846), (6, 0.071303554)]
doc :  2704 [(1, 0.1778369), (2, 0.2919288), (3, 0.2758078), (6, 0.034539063), (9, 0.21082288)]
doc :  2705 [(2, 0.15632401), (8, 0.07305915), (9, 0.7590201)]
doc :  2706 [(0, 0.30751172), (3, 0.27159944), (6, 0.21259624), (7, 0.18829769)]
doc :  2707 [(0, 0.34964612), (2, 0.04765207), (4, 0.13696274), (9, 0.45296067)]
doc :  2708 [(0, 0.011084161), (1, 0.5156114), (2, 0.011083377), (3, 0.011083673), (4, 0.011083469), (5, 0.39572042), (6, 0.011083325), (7, 0.011082832), (8, 0.011083612), (9, 0.011083689)]
doc :  2709 [(2, 0.3875527), (4, 0.06072452), (7, 0.012006693), (8, 0.47950777), (9, 0.049565364)]
doc :  2710 [(1, 0.19976225), (2, 0.21192725), (3, 0.55690503)]
doc :  2711 [(1, 0.30562562), (2, 0.63864017), (9, 0

doc :  2814 [(1, 0.80656993), (6, 0.16139813)]
doc :  2815 [(0, 0.7546205), (1, 0.15734853), (2, 0.067423716)]
doc :  2816 [(1, 0.03332149), (2, 0.13995244), (4, 0.13084486), (5, 0.36509854), (9, 0.31761628)]
doc :  2817 [(0, 0.6840447), (2, 0.06140273), (3, 0.23330587)]
doc :  2818 [(0, 0.595542), (1, 0.0111355195), (2, 0.011136294), (3, 0.01113911), (4, 0.011135884), (5, 0.01113494), (6, 0.011138529), (7, 0.31535858), (8, 0.011141731), (9, 0.011137411)]
doc :  2819 [(4, 0.15346944), (5, 0.037833523), (7, 0.10326376), (9, 0.6952478)]
doc :  2820 [(0, 0.011118551), (1, 0.011120393), (2, 0.0111195315), (3, 0.011120669), (4, 0.15286137), (5, 0.1341942), (6, 0.63510907), (7, 0.011118671), (8, 0.011118572), (9, 0.011119044)]
doc :  2821 [(6, 0.4454586), (9, 0.51822174)]
doc :  2822 [(2, 0.23875234), (3, 0.2605089), (6, 0.03805341), (8, 0.4451969)]
doc :  2823 [(4, 0.23570706), (5, 0.40519178), (6, 0.22262733), (9, 0.12513812)]
doc :  2824 [(3, 0.87112004), (4, 0.034045774), (9, 0.07805619)

doc :  2933 [(3, 0.14654262), (5, 0.5441808), (7, 0.29219314)]
doc :  2934 [(7, 0.95022196)]
doc :  2935 [(0, 0.15057203), (6, 0.47162744), (9, 0.36812755)]
doc :  2936 [(2, 0.15649894), (3, 0.4820245), (5, 0.21465029), (8, 0.12679932)]
doc :  2937 [(3, 0.27112344), (7, 0.4226749), (9, 0.28285977)]
doc :  2938 [(2, 0.13688919), (3, 0.5225409), (5, 0.2786115), (7, 0.05064153)]
doc :  2939 [(4, 0.81632644), (5, 0.086585745), (7, 0.07209648)]
doc :  2940 [(7, 0.32472062), (8, 0.62817305)]
doc :  2941 [(0, 0.20444156), (2, 0.37426835), (4, 0.079148285), (7, 0.32281402)]
doc :  2942 [(0, 0.85341567), (5, 0.041368347), (8, 0.09314011)]
doc :  2943 [(0, 0.88593024), (6, 0.023725703), (9, 0.077593744)]
doc :  2944 [(0, 0.7675301), (1, 0.05690095), (3, 0.026720628), (6, 0.042310897), (8, 0.098585166)]
doc :  2945 [(5, 0.50979716), (7, 0.07249563), (8, 0.20155548), (9, 0.18405384)]
doc :  2946 [(0, 0.22354354), (3, 0.06735502), (5, 0.30032244), (6, 0.38732633)]
doc :  2947 [(1, 0.117285065), (2,

doc :  3042 [(2, 0.35217258), (4, 0.049240984), (5, 0.35027888), (6, 0.1852106), (9, 0.050340086)]
doc :  3043 [(1, 0.4667378), (2, 0.2960501), (6, 0.21780993)]
doc :  3044 [(4, 0.3334898), (8, 0.55455184), (9, 0.094142616)]
doc :  3045 [(2, 0.50781524), (3, 0.10137185), (5, 0.106120154), (6, 0.05779585), (9, 0.21526559)]
doc :  3046 [(1, 0.06863026), (3, 0.51267606), (4, 0.3980865)]
doc :  3047 [(2, 0.26514176), (5, 0.47280598), (6, 0.10308683), (8, 0.09295804), (9, 0.05761642)]
doc :  3048 [(3, 0.11710323), (4, 0.5112037), (6, 0.09995192), (8, 0.18586504), (9, 0.06992493)]
doc :  3049 [(2, 0.068199575), (4, 0.16906902), (5, 0.49866578), (8, 0.24699308)]
doc :  3050 [(1, 0.22238551), (2, 0.21304815), (4, 0.36266464), (8, 0.13074163), (9, 0.061035745)]
doc :  3051 [(0, 0.019888056), (1, 0.019893363), (2, 0.019892242), (3, 0.5642596), (4, 0.019891737), (5, 0.019895677), (6, 0.27660096), (7, 0.019891718), (8, 0.019895554), (9, 0.019891089)]
doc :  3052 [(1, 0.020923113), (2, 0.28715885),

doc :  3183 [(3, 0.06962588), (7, 0.9084051)]
doc :  3184 [(1, 0.040693015), (3, 0.34851992), (4, 0.15938258), (5, 0.063585676), (7, 0.3155347), (9, 0.06938314)]
doc :  3185 [(0, 0.9013266), (1, 0.010963599), (2, 0.010963281), (3, 0.010964752), (4, 0.010966322), (5, 0.0109647205), (6, 0.010963417), (7, 0.010962251), (8, 0.010962506), (9, 0.010962602)]
doc :  3186 [(1, 0.050709642), (3, 0.100936435), (4, 0.14798829), (5, 0.2457869), (7, 0.20301528), (8, 0.13957961), (9, 0.107935585)]
doc :  3187 [(2, 0.28781563), (5, 0.35299203), (7, 0.15595075), (8, 0.18905206)]
doc :  3188 [(0, 0.3713467), (1, 0.34128734), (4, 0.26985636)]
doc :  3189 [(2, 0.062991425), (4, 0.61587065), (5, 0.25064498), (7, 0.054497655)]
doc :  3190 [(2, 0.57305545), (4, 0.11460615), (7, 0.05989429), (8, 0.2386208)]
doc :  3191 [(0, 0.010296197), (1, 0.9073322), (2, 0.010298112), (3, 0.010297408), (4, 0.010296479), (5, 0.01029627), (6, 0.010295432), (7, 0.010296735), (8, 0.010295218), (9, 0.010295974)]
doc :  3192 [(0

doc :  3295 [(1, 0.5257974), (2, 0.089577615), (4, 0.29430875), (8, 0.034636185), (9, 0.048821654)]
doc :  3296 [(1, 0.10448303), (3, 0.056097202), (4, 0.6515265), (9, 0.17505376)]
doc :  3297 [(0, 0.011000228), (1, 0.9009938), (2, 0.010996991), (3, 0.011002194), (4, 0.011000209), (5, 0.011001523), (6, 0.011003521), (7, 0.010999504), (8, 0.011001339), (9, 0.01100067)]
doc :  3298 [(3, 0.09887937), (4, 0.6285275), (8, 0.23864642)]
doc :  3299 [(1, 0.06730629), (2, 0.0782796), (4, 0.24667919), (5, 0.072240695), (8, 0.14332332), (9, 0.38156962)]
doc :  3300 [(1, 0.7598901), (2, 0.13561763), (6, 0.026725246), (7, 0.06989283)]
doc :  3301 [(0, 0.055487443), (1, 0.16168362), (3, 0.25702602), (4, 0.39577883), (9, 0.117430486)]
doc :  3302 [(0, 0.1352642), (5, 0.2624202), (8, 0.29661086), (9, 0.2860171)]
doc :  3303 [(1, 0.43442512), (2, 0.05851227), (4, 0.35619292), (9, 0.12918068)]
doc :  3304 [(3, 0.21315932), (4, 0.21081755), (6, 0.017907083), (7, 0.5307049), (9, 0.020996097)]
doc :  3305 

doc :  3403 [(3, 0.12242049), (4, 0.12155463), (8, 0.4948834), (9, 0.24492297)]
doc :  3404 [(0, 0.23345254), (1, 0.17443874), (4, 0.19137363), (5, 0.2150098), (9, 0.17412396)]
doc :  3405 [(2, 0.18862894), (3, 0.16038296), (4, 0.30211386), (8, 0.32475808)]
doc :  3406 [(1, 0.4031106), (8, 0.41773924), (9, 0.14657028)]
doc :  3407 [(2, 0.49754968), (3, 0.28026637), (4, 0.03981001), (7, 0.06408813), (9, 0.10995523)]
doc :  3408 [(8, 0.2883363), (9, 0.6867176)]
doc :  3409 [(0, 0.16307668), (1, 0.13066477), (7, 0.2602924), (9, 0.42718074)]
doc :  3410 [(0, 0.05478018), (2, 0.20582104), (5, 0.30511308), (8, 0.085374966), (9, 0.33237535)]
doc :  3411 [(4, 0.29503644), (5, 0.2274669), (6, 0.22103554), (9, 0.23259954)]
doc :  3412 [(0, 0.15835522), (1, 0.37295994), (2, 0.012360918), (3, 0.012362992), (4, 0.012363438), (5, 0.012363288), (6, 0.012361727), (7, 0.38214916), (8, 0.0123602), (9, 0.012363093)]
doc :  3413 [(0, 0.13762312), (1, 0.21901746), (4, 0.051407017), (5, 0.26530525), (6, 0.2

doc :  3518 [(4, 0.6125364), (5, 0.3442705)]
doc :  3519 [(3, 0.076270275), (6, 0.7259916), (8, 0.16419017)]
doc :  3520 [(4, 0.47041306), (5, 0.10999874), (8, 0.19708797), (9, 0.21064222)]
doc :  3521 [(1, 0.279532), (4, 0.42166284), (7, 0.17084658), (8, 0.096451074)]
doc :  3522 [(0, 0.079251096), (3, 0.2108574), (4, 0.22775906), (6, 0.45646688)]
doc :  3523 [(2, 0.21566379), (3, 0.5196524), (4, 0.10272379), (7, 0.06437056), (8, 0.020350497), (9, 0.07479014)]
doc :  3524 [(0, 0.012505678), (1, 0.88744354), (2, 0.012505711), (3, 0.012507266), (4, 0.0125059085), (5, 0.012507293), (6, 0.012509762), (7, 0.012503871), (8, 0.012505796), (9, 0.012505206)]
doc :  3525 [(2, 0.44229573), (3, 0.27796736), (5, 0.1430803), (8, 0.11721313)]
doc :  3526 [(0, 0.4343096), (7, 0.49935418)]
doc :  3527 [(0, 0.40650952), (8, 0.2759506), (9, 0.2808258)]
doc :  3528 [(0, 0.34343717), (2, 0.103778675), (3, 0.49528962)]
doc :  3529 [(2, 0.19137236), (3, 0.73216194), (9, 0.058667578)]
doc :  3530 [(1, 0.2107

doc :  3628 [(1, 0.05964611), (2, 0.2130851), (4, 0.12595639), (5, 0.02551942), (7, 0.18361498), (9, 0.3875639)]
doc :  3629 [(2, 0.13249835), (3, 0.03478205), (4, 0.0813935), (5, 0.20195211), (6, 0.16843869), (7, 0.03560127), (8, 0.11380905), (9, 0.22760488)]
doc :  3630 [(2, 0.11062948), (7, 0.40062463), (9, 0.465749)]
doc :  3631 [(7, 0.20701002), (8, 0.29801375), (9, 0.4821813)]
doc :  3632 [(0, 0.276104), (2, 0.28584006), (4, 0.10385409), (5, 0.05397507), (8, 0.22535224), (9, 0.044849224)]
doc :  3633 [(0, 0.084666155), (1, 0.35029215), (4, 0.30763462), (7, 0.23241946)]
doc :  3634 [(0, 0.1612364), (1, 0.010531208), (2, 0.010528611), (3, 0.010527956), (4, 0.27203843), (5, 0.4930217), (6, 0.010529662), (7, 0.010527454), (8, 0.010529803), (9, 0.010528812)]
doc :  3635 [(1, 0.3184992), (4, 0.1947696), (7, 0.1695822), (9, 0.29163545)]
doc :  3636 [(0, 0.2258889), (2, 0.25332204), (3, 0.21324266), (5, 0.28252092)]
doc :  3637 [(0, 0.38627484), (6, 0.58764464)]
doc :  3638 [(0, 0.565913

doc :  3730 [(4, 0.3435393), (7, 0.25280207), (9, 0.37625316)]
doc :  3731 [(4, 0.6621877), (9, 0.29674518)]
doc :  3732 [(1, 0.24157092), (4, 0.42432007), (5, 0.17531668), (6, 0.1351672)]
doc :  3733 [(0, 0.05444838), (4, 0.14441794), (8, 0.15146777), (9, 0.63309366)]
doc :  3734 [(2, 0.3867087), (7, 0.0822068), (9, 0.5019433)]
doc :  3735 [(1, 0.359937), (3, 0.1563264), (5, 0.27264136), (9, 0.17968637)]
doc :  3736 [(1, 0.495563), (3, 0.18905866), (6, 0.09652286), (8, 0.20682621)]
doc :  3737 [(0, 0.6292142), (7, 0.32342193)]
doc :  3738 [(0, 0.14156212), (1, 0.040690966), (3, 0.33005038), (8, 0.17145982), (9, 0.3087773)]
doc :  3739 [(0, 0.03864288), (2, 0.33911422), (3, 0.030513637), (4, 0.21337801), (5, 0.074111454), (7, 0.02907684), (9, 0.27136704)]
doc :  3740 [(2, 0.10864311), (3, 0.6022147), (4, 0.034234412), (6, 0.02872102), (7, 0.1987534), (9, 0.022126257)]
doc :  3741 [(0, 0.37849614), (2, 0.16056933), (7, 0.23633729), (8, 0.19260883)]
doc :  3742 [(1, 0.37241915), (6, 0.23

doc :  3844 [(0, 0.4027255), (2, 0.37183762), (3, 0.20912158)]
doc :  3845 [(1, 0.67589116), (4, 0.078177035), (9, 0.2190893)]
doc :  3846 [(1, 0.43422344), (4, 0.40614507), (8, 0.12152239)]
doc :  3847 [(0, 0.5566607), (1, 0.16678885), (5, 0.2509451)]
doc :  3848 [(2, 0.21502472), (3, 0.13124295), (4, 0.3053413), (8, 0.15155546), (9, 0.1850839)]
doc :  3849 [(1, 0.37709755), (4, 0.46776986), (9, 0.13160734)]
doc :  3850 [(1, 0.16305417), (2, 0.33680704), (6, 0.36488554), (7, 0.05017426), (9, 0.070932224)]
doc :  3851 [(4, 0.026426807), (6, 0.07297011), (8, 0.04500817), (9, 0.84937847)]
doc :  3852 [(2, 0.34712943), (3, 0.29364595), (5, 0.31985185)]
doc :  3853 [(0, 0.1295524), (2, 0.31539837), (4, 0.46916083), (6, 0.06149338)]
doc :  3854 [(0, 0.18500821), (2, 0.25466943), (4, 0.38645566), (5, 0.1289887), (6, 0.035559647)]
doc :  3855 [(1, 0.018507997), (2, 0.10804754), (3, 0.3972863), (4, 0.047494404), (6, 0.0132140815), (7, 0.12603548), (9, 0.28792766)]
doc :  3856 [(0, 0.08072931),

doc :  3958 [(0, 0.06392269), (1, 0.24998434), (2, 0.3553172), (3, 0.101098955), (4, 0.14422427), (9, 0.08052529)]
doc :  3959 [(2, 0.5692711), (9, 0.39087442)]
doc :  3960 [(1, 0.30505943), (2, 0.20082031), (4, 0.4679809)]
doc :  3961 [(2, 0.25730518), (4, 0.06609933), (5, 0.6064897), (6, 0.03065339), (8, 0.03065982)]
doc :  3962 [(2, 0.08161634), (3, 0.1882696), (5, 0.06630047), (7, 0.4474775), (9, 0.20393683)]
doc :  3963 [(0, 0.06405211), (5, 0.6832235), (7, 0.08339469), (8, 0.15887727)]
doc :  3964 [(2, 0.14608604), (3, 0.14461143), (4, 0.07045235), (5, 0.20697899), (7, 0.4101368)]
doc :  3965 [(0, 0.015253053), (1, 0.015247951), (2, 0.23408501), (3, 0.015248676), (4, 0.015250681), (5, 0.015253185), (6, 0.0152489375), (7, 0.015247999), (8, 0.6439114), (9, 0.015253058)]
doc :  3966 [(1, 0.34348413), (2, 0.120468155), (7, 0.48783913)]
doc :  3967 [(0, 0.13589558), (1, 0.7053759), (2, 0.06470263), (6, 0.08626524)]
doc :  3968 [(3, 0.20565182), (7, 0.04035053), (8, 0.7357588)]
doc :  