In [15]:
### @credit: gensim tutorial on LDA and ensembleLda
import gensim
from spacy.lang.en.stop_words import STOP_WORDS
def tokenize(sentence):
    return [token for token in sentence.split() if token not in STOP_WORDS]
import re
def clean_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    return re.sub(r'\s{2,}', ' ', sentence)
from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=1,
                      threshold=10,
                      progress_per=1000)
    return Phraser(phrases)
def sentence_to_bi_grams(phrases_model, sentence):
    return ' '.join(phrases_model[sentence])


In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
for handler in logger.handlers:
    handler.setLevel(logging.DEBUG)

In [3]:
p = []
with open("phrases.txt") as f:
    for line in f:
        p.append(line.split())
p = dict(p)

In [4]:
docs = []
with open("textretrieval.txt", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
with open("textanalytics.txt") as f:
    for line in f:
        docs.append(line.split('.'))
with open("noncs.dat", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
with open("cs125.dat", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
with open("bkgd.dat", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
print(len(docs))
docs = [[clean_sentence(s) for s in sentences] for sentences in docs]
docs = [[tokenize(s) for s in sentences] for sentences in docs]

phrased = []
for sentences in docs:
    ps = []
    for s in sentences:
        i = 0
        new_s = []
        while i < len(s) - 1:
            if (s[i+1] == p.get(s[i], False)):
                new_s.append(s[i] + "_" + s[i+1])
                i+=2
            else:
                new_s.append(s[i])
                i+=1
        ps.append(new_s)
    phrased.append(ps)
len(phrased)

160


160

In [5]:
phrased_doc = []
for doc in phrased:
    d = []
    for s in doc:
        d+=s
    phrased_doc.append(d)
phrased_doc

[['lecture',
  'natural_language',
  'content',
  'picture',
  'step',
  'process',
  'text',
  'data',
  'text',
  'data',
  'natural',
  'computers',
  'understand',
  'natural_language',
  'extent',
  'order',
  'use',
  'thats',
  'topic',
  'going',
  'cover',
  'natural_language',
  'processing',
  'main',
  'technique',
  'processing',
  'natural_language',
  'obtain',
  'understanding',
  'second',
  'state',
  'art',
  'nlp',
  'stands',
  'natural_language',
  'finally',
  'going',
  'cover',
  'relation',
  'natural_language',
  'processing',
  'text',
  'nlp',
  'best',
  'way',
  'explain',
  'think',
  'text',
  'foreign',
  'language',
  'order',
  'understand',
  'text',
  'basically',
  'computers',
  'facing',
  'right',
  'looking',
  'simple',
  'sentence',
  'like',
  'dog',
  'chasing',
  'boy',
  'dont',
  'problem',
  'understanding',
  'imagine',
  'computer',
  'order',
  'understand',
  'general',
  'know',
  'dogs',
  'noun',
  'chasing',
  'verb',
  'called

In [21]:
# with open("miss.dat", "w") as f:
#     for doc in phrased_doc:
#         f.write(" ".join(doc)+"\n")

In [7]:
from gensim.models import LdaModel

In [8]:
from gensim.corpora import Dictionary
import random

dictionary = Dictionary(phrased_doc)
dictionary.filter_extremes(no_below=2, no_above=0.1)

corpus = [dictionary.doc2bow(doc) for doc in phrased_doc]
num_topics = 86*2
chunksize = 160 # how many documents to process at a time
passes = 20 # epochs
iterations = 30
eval_every = 10

# Make a index to word dictionary.
temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


2021-12-03 19:55:44,027 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2021-12-03 19:55:44,129 : INFO : built Dictionary<12250 unique tokens: ['1st', '90', '97', 'able', 'accuracy']...> from 160 documents (total 216470 corpus positions)
2021-12-03 19:55:44,130 : DEBUG : starting a new internal lifecycle event log for Dictionary
2021-12-03 19:55:44,130 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12250 unique tokens: ['1st', '90', '97', 'able', 'accuracy']...> from 160 documents (total 216470 corpus positions)", 'datetime': '2021-12-03T19:55:44.130335', 'gensim': '4.1.3.dev0', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-12-03 19:55:44,138 : INFO : discarding 6758 tokens: [('able', 87), ('accuracy', 17), ('achieve', 35), ('acquired', 1), ('act', 18), ('actually', 136), ('add', 61), ('additional', 36), ('allow', 73), ('alright', 46)]...
2021-12-03 19:55:44

2021-12-03 19:55:44,829 : DEBUG : bound: at document #0
2021-12-03 19:55:45,046 : INFO : -18.707 per-word bound, 427826.7 perplexity estimate based on a held-out corpus of 160 documents with 51688 words
2021-12-03 19:55:45,047 : INFO : PROGRESS: pass 1, at document #160/160
2021-12-03 19:55:45,047 : DEBUG : performing inference on a chunk of 160 documents
2021-12-03 19:55:45,173 : DEBUG : 130/160 documents converged within 30 iterations
2021-12-03 19:55:45,176 : INFO : optimized alpha [0.006600806, 0.0064952886, 0.006503571, 0.0067020403, 0.0062506893, 0.006279826, 0.0063097854, 0.0068702437, 0.007213558, 0.0069002435, 0.0061256248, 0.0065057506, 0.006395511, 0.006257037, 0.006176531, 0.0061140615, 0.006417813, 0.0064625302, 0.0061320076, 0.0068775136, 0.0062315864, 0.006044211, 0.0067969863, 0.00635977, 0.006231343, 0.0063282778, 0.0063077216, 0.006532766, 0.0062872088, 0.0062205745, 0.0062796585, 0.0061138673, 0.0063213115, 0.006379321, 0.006215015, 0.0062766555, 0.0063834665, 0.0064

2021-12-03 19:55:45,980 : INFO : -10.492 per-word bound, 1440.1 perplexity estimate based on a held-out corpus of 160 documents with 51688 words
2021-12-03 19:55:45,981 : INFO : PROGRESS: pass 3, at document #160/160
2021-12-03 19:55:45,981 : DEBUG : performing inference on a chunk of 160 documents
2021-12-03 19:55:46,071 : DEBUG : 158/160 documents converged within 30 iterations
2021-12-03 19:55:46,074 : INFO : optimized alpha [0.0068602543, 0.0067366194, 0.006275971, 0.006858976, 0.006185642, 0.0060674874, 0.0062448983, 0.007490624, 0.008494921, 0.0070361416, 0.0059235, 0.0065267016, 0.0061753453, 0.0060835914, 0.006006783, 0.005912692, 0.0061961226, 0.006661169, 0.0060023894, 0.0072014113, 0.0060224677, 0.005847379, 0.007180474, 0.006142035, 0.0060222414, 0.006112676, 0.0060935067, 0.0066120015, 0.006074374, 0.006012188, 0.0061593335, 0.0059125107, 0.0061061834, 0.00616026, 0.0060069994, 0.0060645295, 0.006541659, 0.0067722714, 0.00597827, 0.006338783, 0.006477747, 0.0062622204, 0.0

2021-12-03 19:55:46,839 : INFO : PROGRESS: pass 5, at document #160/160
2021-12-03 19:55:46,840 : DEBUG : performing inference on a chunk of 160 documents
2021-12-03 19:55:46,926 : DEBUG : 157/160 documents converged within 30 iterations
2021-12-03 19:55:46,929 : INFO : optimized alpha [0.007040431, 0.006987608, 0.0060996828, 0.006987578, 0.006165877, 0.0059026387, 0.0061933165, 0.008029882, 0.009690902, 0.007146045, 0.00576633, 0.0066258325, 0.006004625, 0.0059481706, 0.005874147, 0.0057560913, 0.0060242605, 0.006841523, 0.005900507, 0.007399393, 0.0058600395, 0.005694197, 0.007523428, 0.005973139, 0.005859825, 0.005945379, 0.005927251, 0.0066767805, 0.0059091523, 0.005850308, 0.0060954625, 0.0057559214, 0.005939239, 0.005990367, 0.0058453986, 0.005899839, 0.0066718524, 0.0070214584, 0.0058181994, 0.0064085606, 0.006546906, 0.0062250616, 0.007654893, 0.0061250837, 0.005994511, 0.0063975113, 0.006143527, 0.005907283, 0.005970617, 0.005736421, 0.0056317807, 0.0065002157, 0.006617995, 0.

2021-12-03 19:55:47,820 : DEBUG : 160/160 documents converged within 30 iterations
2021-12-03 19:55:47,823 : INFO : optimized alpha [0.007213145, 0.007224335, 0.0059541394, 0.007098793, 0.006149428, 0.005766293, 0.006150026, 0.008482551, 0.01081337, 0.0072569503, 0.0056361733, 0.006725663, 0.0058635557, 0.005835663, 0.0057639135, 0.0056263944, 0.0058822725, 0.0070430753, 0.005815452, 0.0075717834, 0.005725644, 0.0055672578, 0.007827563, 0.0058335364, 0.0057254382, 0.0058070626, 0.0057897726, 0.0067326124, 0.005772507, 0.0057163546, 0.0060554356, 0.005626231, 0.005801207, 0.0058499626, 0.005711669, 0.0057636225, 0.006784816, 0.007224819, 0.005685704, 0.0064412197, 0.0066066994, 0.006207905, 0.007994531, 0.005978333, 0.0058799745, 0.006429664, 0.005995896, 0.005796153, 0.0058311312, 0.005607604, 0.005507594, 0.00656037, 0.0066758166, 0.0069515747, 0.005822423, 0.0058051934, 0.006582569, 0.005877975, 0.0070080115, 0.008688743, 0.005966058, 0.0067096655, 0.007821655, 0.0076759052, 0.006324

2021-12-03 19:55:48,675 : DEBUG : updating topics
2021-12-03 19:55:48,775 : INFO : topic #50 (0.005): 0.001*"z" + 0.000*"bar" + 0.000*"sampling" + 0.000*"education" + 0.000*"population" + 0.000*"declared" + 0.000*"cluster" + 0.000*"primitive" + 0.000*"statistics" + 0.000*"caller"
2021-12-03 19:55:48,776 : INFO : topic #49 (0.005): 0.001*"nodes" + 0.001*"node" + 0.001*"pivot" + 0.001*"root" + 0.001*"subtree" + 0.001*"population" + 0.001*"recursive" + 0.001*"merge" + 0.001*"z" + 0.001*"sorted"
2021-12-03 19:55:48,777 : INFO : topic #87 (0.009): 0.036*"pages" + 0.018*"challenges" + 0.018*"web_search" + 0.012*"york" + 0.011*"playing" + 0.009*"listen" + 0.009*"updated" + 0.008*"speaker" + 0.008*"established" + 0.007*"party"
2021-12-03 19:55:48,777 : INFO : topic #8 (0.012): 0.015*"fear" + 0.014*"waiting" + 0.012*"enhanced" + 0.011*"net" + 0.010*"airplane" + 0.009*"yellow" + 0.009*"scared" + 0.009*"meet" + 0.008*"quot" + 0.008*"crazy"
2021-12-03 19:55:48,778 : INFO : topic #104 (0.013): 0.08

2021-12-03 19:55:49,635 : INFO : topic #21 (0.005): 0.000*"internet" + 0.000*"dog" + 0.000*"pet" + 0.000*"ip" + 0.000*"bar" + 0.000*"protocol" + 0.000*"z" + 0.000*"constructor" + 0.000*"population" + 0.000*"beta_values"
2021-12-03 19:55:49,635 : INFO : topic #87 (0.010): 0.036*"pages" + 0.018*"challenges" + 0.018*"web_search" + 0.011*"york" + 0.011*"playing" + 0.009*"updated" + 0.009*"listen" + 0.008*"speaker" + 0.008*"established" + 0.007*"party"
2021-12-03 19:55:49,636 : INFO : topic #8 (0.013): 0.015*"fear" + 0.014*"waiting" + 0.012*"enhanced" + 0.011*"net" + 0.010*"airplane" + 0.009*"yellow" + 0.009*"scared" + 0.009*"meet" + 0.008*"quot" + 0.008*"blocks"
2021-12-03 19:55:49,636 : INFO : topic #104 (0.014): 0.088*"node" + 0.085*"nodes" + 0.075*"subtree" + 0.054*"root" + 0.043*"recursive" + 0.029*"factorial" + 0.023*"leaf" + 0.021*"base" + 0.017*"rooted" + 0.015*"trees"
2021-12-03 19:55:49,639 : INFO : topic diff=1.557276, rho=0.277350
2021-12-03 19:55:49,696 : DEBUG : bound: at docu

2021-12-03 19:55:50,491 : INFO : topic #87 (0.010): 0.036*"pages" + 0.018*"web_search" + 0.018*"challenges" + 0.011*"york" + 0.011*"playing" + 0.009*"updated" + 0.009*"listen" + 0.008*"speaker" + 0.008*"established" + 0.007*"party"
2021-12-03 19:55:50,491 : INFO : topic #8 (0.014): 0.015*"fear" + 0.014*"waiting" + 0.012*"enhanced" + 0.011*"net" + 0.010*"airplane" + 0.009*"yellow" + 0.009*"scared" + 0.009*"meet" + 0.008*"quot" + 0.008*"blocks"
2021-12-03 19:55:50,492 : INFO : topic #104 (0.015): 0.089*"node" + 0.086*"nodes" + 0.076*"subtree" + 0.054*"root" + 0.043*"recursive" + 0.030*"factorial" + 0.023*"leaf" + 0.021*"base" + 0.018*"rooted" + 0.015*"trees"
2021-12-03 19:55:50,494 : INFO : topic diff=0.910397, rho=0.258199
2021-12-03 19:55:50,552 : DEBUG : bound: at document #0
2021-12-03 19:55:50,722 : INFO : -8.785 per-word bound, 441.2 perplexity estimate based on a held-out corpus of 160 documents with 51688 words
2021-12-03 19:55:50,723 : INFO : PROGRESS: pass 14, at document #160/

2021-12-03 19:55:51,330 : INFO : topic #8 (0.015): 0.015*"fear" + 0.014*"waiting" + 0.012*"enhanced" + 0.011*"net" + 0.010*"airplane" + 0.009*"yellow" + 0.009*"scared" + 0.009*"meet" + 0.008*"quot" + 0.008*"blocks"
2021-12-03 19:55:51,330 : INFO : topic #104 (0.016): 0.090*"node" + 0.087*"nodes" + 0.077*"subtree" + 0.054*"root" + 0.044*"recursive" + 0.030*"factorial" + 0.023*"leaf" + 0.022*"base" + 0.018*"rooted" + 0.015*"trees"
2021-12-03 19:55:51,333 : INFO : topic diff=0.542155, rho=0.242536
2021-12-03 19:55:51,397 : DEBUG : bound: at document #0
2021-12-03 19:55:51,565 : INFO : -8.783 per-word bound, 440.4 perplexity estimate based on a held-out corpus of 160 documents with 51688 words
2021-12-03 19:55:51,565 : INFO : PROGRESS: pass 16, at document #160/160
2021-12-03 19:55:51,565 : DEBUG : performing inference on a chunk of 160 documents
2021-12-03 19:55:51,647 : DEBUG : 159/160 documents converged within 30 iterations
2021-12-03 19:55:51,650 : INFO : optimized alpha [0.00782265, 

2021-12-03 19:55:52,171 : INFO : topic #104 (0.017): 0.090*"node" + 0.088*"nodes" + 0.077*"subtree" + 0.054*"root" + 0.044*"recursive" + 0.030*"factorial" + 0.023*"leaf" + 0.022*"base" + 0.018*"rooted" + 0.015*"trees"
2021-12-03 19:55:52,173 : INFO : topic diff=0.330385, rho=0.229416
2021-12-03 19:55:52,231 : DEBUG : bound: at document #0
2021-12-03 19:55:52,402 : INFO : -8.778 per-word bound, 439.0 perplexity estimate based on a held-out corpus of 160 documents with 51688 words
2021-12-03 19:55:52,403 : INFO : PROGRESS: pass 18, at document #160/160
2021-12-03 19:55:52,403 : DEBUG : performing inference on a chunk of 160 documents
2021-12-03 19:55:52,484 : DEBUG : 158/160 documents converged within 30 iterations
2021-12-03 19:55:52,487 : INFO : optimized alpha [0.007934996, 0.008440504, 0.005414969, 0.0076723523, 0.0061836564, 0.0052592773, 0.005984969, 0.010496691, 0.016284537, 0.0077123847, 0.0051508993, 0.007199707, 0.0053400034, 0.005488888, 0.0053499513, 0.0051427376, 0.005355509

2021-12-03 19:55:53,024 : INFO : topic diff=0.206339, rho=0.218218
2021-12-03 19:55:53,054 : DEBUG : starting a new internal lifecycle event log for LdaModel
2021-12-03 19:55:53,054 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel<num_terms=5492, num_topics=172, decay=0.5, chunksize=160> in 8.79s', 'datetime': '2021-12-03T19:55:53.053834', 'gensim': '4.1.3.dev0', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [17]:
### after word2vec skimming
reduced = []
with open("reducedretrieval.dat", encoding='utf-8') as f:
    for line in f:
        reduced.append(line.split('.'))
with open("reducedanalytics.dat", encoding='utf-8') as f:
    for line in f:
        reduced.append(line.split('.'))
reduced = [[clean_sentence(s) for s in sentences] for sentences in reduced]
reduced = [[tokenize(s) for s in sentences] for sentences in reduced]

phrased = []
for sentences in docs:
    ps = []
    for s in sentences:
        i = 0
        new_s = []
        while i < len(s) - 1:
            if (s[i+1] == p.get(s[i], False)):
                new_s.append(s[i] + "_" + s[i+1])
                i+=2
            else:
                new_s.append(s[i])
                i+=1
        ps.append(new_s)
    phrased.append(ps)

phrased_doc = []
for doc in phrased:
    d = []
    for s in doc:
        d+=s
    phrased_doc.append(d)
reduced_corpus = [dictionary.doc2bow(doc) for doc in phrased_doc]

In [22]:

print(len(phrased_doc))
i = 17
i-=1
top_topics = model.get_document_topics(reduced_corpus[i]) # [(topic_id, prob)]
print(phrased_doc[i])
####
top_topics.sort(key=takelast, reverse=True)
for topic in top_topics:
    print('topic {}'.format(i))
    i+=1
    topic_term_distribution = model.get_topic_terms(topic[0])
    for term in topic_term_distribution:
        print('{}, {}'.format(dictionary[term[0]], term[1]))
    print('prob: {}'.format(topic[-1]))

160
['lecture', 'evaluate', 'text', 'retrieval', 'system', 'multiple', 'levels', 'lecture', 'continue', 'discussion', 'going', 'look', 'evaluate', 'text', 'retrieval', 'system', 'multiple', 'level', 'far', 'talked', 'binary', 'means', 'document', 'judged', 'relevant', 'non_relevant', 'earlier', 'talk', 'relevance', 'matter', 'degree', 'distinguishing', 'high', 'relative', 'useful', 'documents', 'moderately', 'relevant_documents', 'ok', 'non_relevant', 'documents', 'imagine', 'ratings', 'multiple', 'levels', 'example', 'example', 'levels', '3', 'relevant', 'sorry', '3', 'relevant', 'marginally', 'relevant', 'non_relevant', 'evaluate', 'search_engine', 'system', 'judgments', 'obviously', 'map', 'doesnt', 'average_precision', 'doesnt', 'precision_recall', 'doesnt', 'work', 'rely', 'binary', 'lets', 'look', 'ranked', 'results', 'judgments', 'right', 'imagine', 'user', 'care', '10', 'right', 'marked', 'reading', 'levels', 'relevance', 'levels', 'documents', 'shown', '32113', 'reason', 'gain

In [19]:
def takelast(x):
    return x[-1]

In [13]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("model2.lda")
model.save(temp_file)

2021-11-27 22:20:07,084 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2021-11-27 22:20:07,084 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2021-11-27 22:20:07,085 : DEBUG : starting a new internal lifecycle event log for Dictionary
2021-11-27 22:20:07,085 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2021-11-27T22:20:07.085097', 'gensim': '4.1.3.dev0', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-11-27 22:20:07,137 : DEBUG : starting a new internal lifecycle event log for LdaState
2021-11-27 22:20:07,137 : INFO : LdaState lifecycle event {'fname_or_handle': 'D:\\ProgramFiles\\conda\\lib\\site-packages\\gensim-4.1.3