In [20]:
### @credit: gensim tutorial on LDA and ensembleLda
import gensim
from spacy.lang.en.stop_words import STOP_WORDS
def tokenize(sentence):
    return [token for token in sentence.split() if token not in STOP_WORDS]
import re
def clean_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    return re.sub(r'\s{2,}', ' ', sentence)
from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=1,
                      threshold=10,
                      progress_per=1000)
    return Phraser(phrases)
def sentence_to_bi_grams(phrases_model, sentence):
    return ' '.join(phrases_model[sentence])


In [140]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
for handler in logger.handlers:
    handler.setLevel(logging.DEBUG)

In [26]:
p = []
with open("phrases.txt") as f:
    for line in f:
        p.append(line.split())
p = dict(p)

In [161]:
docs = []
with open("textretrieval.txt", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
with open("textanalytics.txt") as f:
    for line in f:
        docs.append(line.split('.'))
with open("noncs.dat", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
with open("cs125.dat", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
with open("bkgd.dat", encoding='utf-8') as f:
    for line in f:
        docs.append(line.split('.'))
print(len(docs))
docs = [[clean_sentence(s) for s in sentences] for sentences in docs]
docs = [[tokenize(s) for s in sentences] for sentences in docs]

phrased = []
for sentences in docs:
    ps = []
    for s in sentences:
        i = 0
        new_s = []
        while i < len(s) - 1:
            if (s[i+1] == p.get(s[i], False)):
                new_s.append(s[i] + "_" + s[i+1])
                i+=2
            else:
                new_s.append(s[i])
                i+=1
        ps.append(new_s)
    phrased.append(ps)
len(phrased)

160


160

In [162]:
phrased_doc = []
for doc in phrased:
    d = []
    for s in doc:
        d+=s
    phrased_doc.append(d)
phrased_doc

[['lecture',
  'natural_language',
  'content',
  'picture',
  'step',
  'process',
  'text',
  'data',
  'text',
  'data',
  'natural',
  'computers',
  'understand',
  'natural_language',
  'extent',
  'order',
  'use',
  'thats',
  'topic',
  'going',
  'cover',
  'natural_language',
  'processing',
  'main',
  'technique',
  'processing',
  'natural_language',
  'obtain',
  'understanding',
  'second',
  'state',
  'art',
  'nlp',
  'stands',
  'natural_language',
  'finally',
  'going',
  'cover',
  'relation',
  'natural_language',
  'processing',
  'text',
  'nlp',
  'best',
  'way',
  'explain',
  'think',
  'text',
  'foreign',
  'language',
  'order',
  'understand',
  'text',
  'basically',
  'computers',
  'facing',
  'right',
  'looking',
  'simple',
  'sentence',
  'like',
  'dog',
  'chasing',
  'boy',
  'dont',
  'problem',
  'understanding',
  'imagine',
  'computer',
  'order',
  'understand',
  'general',
  'know',
  'dogs',
  'noun',
  'chasing',
  'verb',
  'called

In [45]:
from gensim.models import LdaModel

In [228]:
from gensim.corpora import Dictionary
import random

dictionary = Dictionary(phrased_doc)
dictionary.filter_extremes(no_below=2, no_above=0.1)

corpus = [dictionary.doc2bow(doc) for doc in phrased_doc]
num_topics = 86*2
chunksize = 160 # how many documents to process at a time
passes = 20 # epochs
iterations = 30
eval_every = 10

# Make a index to word dictionary.
temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


2021-11-25 19:02:00,764 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2021-11-25 19:02:00,858 : INFO : built Dictionary<12163 unique tokens: ['1st', '90', '97', 'able', 'accuracy']...> from 160 documents (total 216965 corpus positions)
2021-11-25 19:02:00,859 : DEBUG : starting a new internal lifecycle event log for Dictionary
2021-11-25 19:02:00,859 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12163 unique tokens: ['1st', '90', '97', 'able', 'accuracy']...> from 160 documents (total 216965 corpus positions)", 'datetime': '2021-11-25T19:02:00.859931', 'gensim': '4.1.3.dev0', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-11-25 19:02:00,869 : INFO : discarding 6741 tokens: [('able', 87), ('accuracy', 17), ('achieve', 35), ('acquired', 1), ('act', 18), ('actually', 136), ('add', 61), ('additional', 42), ('allow', 73), ('alright', 46)]...
2021-11-25 19:02:00

2021-11-25 19:02:01,572 : DEBUG : bound: at document #0
2021-11-25 19:02:01,796 : INFO : -18.759 per-word bound, 443650.9 perplexity estimate based on a held-out corpus of 160 documents with 51198 words
2021-11-25 19:02:01,796 : INFO : PROGRESS: pass 1, at document #160/160
2021-11-25 19:02:01,797 : DEBUG : performing inference on a chunk of 160 documents
2021-11-25 19:02:01,940 : DEBUG : 110/160 documents converged within 30 iterations
2021-11-25 19:02:01,943 : INFO : optimized alpha [0.0070196334, 0.0064326935, 0.0065632253, 0.0062244306, 0.0062929974, 0.006251213, 0.0066822423, 0.006232281, 0.006347054, 0.0073806327, 0.0063336636, 0.0070867846, 0.006492251, 0.006574016, 0.006520144, 0.0064113834, 0.0065198294, 0.006344076, 0.007108377, 0.006280984, 0.006503741, 0.0064578718, 0.0066319765, 0.0066674766, 0.0066002854, 0.0061828448, 0.006530553, 0.0063648955, 0.0064504547, 0.0065655666, 0.006006851, 0.0063912943, 0.0063980706, 0.006254713, 0.006892901, 0.0066774366, 0.0066480986, 0.006

2021-11-25 19:02:02,730 : INFO : -10.648 per-word bound, 1605.0 perplexity estimate based on a held-out corpus of 160 documents with 51198 words
2021-11-25 19:02:02,731 : INFO : PROGRESS: pass 3, at document #160/160
2021-11-25 19:02:02,731 : DEBUG : performing inference on a chunk of 160 documents
2021-11-25 19:02:02,820 : DEBUG : 158/160 documents converged within 30 iterations
2021-11-25 19:02:02,824 : INFO : optimized alpha [0.0075407424, 0.006211033, 0.006390692, 0.0060167694, 0.006304025, 0.006411362, 0.0071482635, 0.0062798345, 0.0061312043, 0.008763938, 0.006267566, 0.0074540987, 0.006578788, 0.007014094, 0.007060648, 0.0064950925, 0.0065413886, 0.0062026842, 0.007347569, 0.0061062924, 0.006467805, 0.006271992, 0.0068911766, 0.0064875963, 0.006780316, 0.005977926, 0.006302157, 0.0062219417, 0.0068075876, 0.0065861684, 0.005813342, 0.006415556, 0.0061787674, 0.0060450453, 0.007238589, 0.006777434, 0.006970411, 0.0064453683, 0.0061913384, 0.0066947513, 0.0061445916, 0.0059473626,

2021-11-25 19:02:03,616 : INFO : PROGRESS: pass 5, at document #160/160
2021-11-25 19:02:03,616 : DEBUG : performing inference on a chunk of 160 documents
2021-11-25 19:02:03,711 : DEBUG : 159/160 documents converged within 30 iterations
2021-11-25 19:02:03,714 : INFO : optimized alpha [0.007947951, 0.0060392534, 0.006239682, 0.0058554965, 0.006313537, 0.00657746, 0.007527082, 0.006382072, 0.0059637814, 0.009971218, 0.006293729, 0.0077441195, 0.006746875, 0.0073893853, 0.007511807, 0.0065638246, 0.0065763304, 0.006091774, 0.0075439555, 0.005969968, 0.0064392188, 0.0061124214, 0.007125174, 0.0063639004, 0.0069097336, 0.005818714, 0.006125337, 0.0061098, 0.0070918887, 0.006620487, 0.0056627193, 0.0065170624, 0.006008755, 0.0058822627, 0.0074731773, 0.006808178, 0.0072057997, 0.006292601, 0.0060813967, 0.0068952683, 0.0060063032, 0.0057897633, 0.008891266, 0.0073625953, 0.0060122386, 0.0058344137, 0.006090056, 0.0064708814, 0.005781688, 0.005875502, 0.006335643, 0.005663317, 0.0069290595,

2021-11-25 19:02:04,497 : DEBUG : performing inference on a chunk of 160 documents
2021-11-25 19:02:04,597 : DEBUG : 157/160 documents converged within 30 iterations
2021-11-25 19:02:04,600 : INFO : optimized alpha [0.008327239, 0.0058973366, 0.0060882494, 0.005722036, 0.0063219774, 0.0067510433, 0.007864207, 0.006470655, 0.005825368, 0.011029658, 0.0063304054, 0.007981594, 0.0069077555, 0.0077565056, 0.007951287, 0.0066514723, 0.0065927533, 0.005999361, 0.007682051, 0.005856743, 0.006443037, 0.0059670666, 0.0073304726, 0.0063146213, 0.007021948, 0.0056869155, 0.0059793703, 0.006016362, 0.007327698, 0.006650249, 0.005537854, 0.0066050556, 0.0058682603, 0.005747586, 0.007678254, 0.0068347184, 0.0073970333, 0.0061928094, 0.005989772, 0.007070769, 0.0058914362, 0.005659266, 0.009566318, 0.0075857225, 0.005949127, 0.0057019074, 0.00607658, 0.0065323487, 0.0056515518, 0.0058428776, 0.00634461, 0.005538427, 0.0071952073, 0.0068151653, 0.0058232676, 0.006285411, 0.0065351683, 0.0062877294, 0.

2021-11-25 19:02:05,541 : DEBUG : 160/160 documents converged within 30 iterations
2021-11-25 19:02:05,545 : INFO : optimized alpha [0.0087066805, 0.005775691, 0.0059586423, 0.005607479, 0.0063296882, 0.0069075245, 0.008140932, 0.0065497253, 0.005706657, 0.012019222, 0.006363106, 0.00819527, 0.0071205613, 0.008093446, 0.008389412, 0.0067296894, 0.0065945648, 0.0059196036, 0.00779025, 0.0057592704, 0.0064468, 0.005842541, 0.0075295973, 0.0062970067, 0.0071221213, 0.0055737533, 0.0058543347, 0.005935722, 0.007541144, 0.006676822, 0.005430518, 0.006670928, 0.005747805, 0.005632009, 0.007862671, 0.0068583954, 0.007554455, 0.006142224, 0.005910678, 0.0072287046, 0.005792575, 0.0055471947, 0.010197543, 0.0078014024, 0.0058944733, 0.00558815, 0.006076521, 0.0065870653, 0.005539785, 0.0058145747, 0.006352821, 0.005431068, 0.007437546, 0.0068912725, 0.0057046423, 0.0062687406, 0.0066141076, 0.0062346896, 0.0056255013, 0.0067878016, 0.00873265, 0.0061530154, 0.0058893496, 0.0056846747, 0.0053722

2021-11-25 19:02:06,439 : INFO : optimized alpha [0.009060111, 0.0056688613, 0.0058449716, 0.005506754, 0.006336859, 0.007051103, 0.008396626, 0.006621684, 0.0056023546, 0.012970312, 0.0063928445, 0.008391038, 0.007317236, 0.00839295, 0.008801731, 0.006800843, 0.0065964647, 0.005849156, 0.007874443, 0.005673355, 0.006450451, 0.0057332367, 0.007726303, 0.006303884, 0.0072132703, 0.0054742303, 0.0057445904, 0.005864505, 0.007737774, 0.00670101, 0.0053360234, 0.0067186393, 0.0056419997, 0.005530405, 0.00803148, 0.0068799425, 0.0076984153, 0.0061085722, 0.005840807, 0.0073734, 0.0057054604, 0.005448614, 0.010794295, 0.007999694, 0.0058460613, 0.005488115, 0.006087627, 0.0066367514, 0.005441467, 0.005789477, 0.00636047, 0.0053365543, 0.0076617473, 0.006997571, 0.005600413, 0.006264934, 0.0066859922, 0.0062209005, 0.0055241305, 0.0068577295, 0.008992233, 0.0060973563, 0.0058200937, 0.005581171, 0.00527976, 0.0077340906, 0.008017018, 0.008011085, 0.0065044956, 0.011064169, 0.006901854, 0.0058

2021-11-25 19:02:07,309 : DEBUG : updating topics
2021-11-25 19:02:07,410 : INFO : topic #161 (0.005): 0.000*"arraylist" + 0.000*"linked" + 0.000*"lists" + 0.000*"file" + 0.000*"correlated" + 0.000*"copy" + 0.000*"interface" + 0.000*"official" + 0.000*"classifier" + 0.000*"o"
2021-11-25 19:02:07,411 : INFO : topic #64 (0.005): 0.000*"sports" + 0.000*"lists" + 0.000*"arraylist" + 0.000*"linked" + 0.000*"travel" + 0.000*"bar" + 0.000*"interface" + 0.000*"z" + 0.000*"population" + 0.000*"scoring_function"
2021-11-25 19:02:07,412 : INFO : topic #42 (0.011): 0.025*"groups" + 0.014*"applause" + 0.011*"light" + 0.009*"coin" + 0.009*"leads" + 0.008*"tail" + 0.008*"random_variable" + 0.008*"selecting" + 0.008*"present" + 0.008*"painting"
2021-11-25 19:02:07,412 : INFO : topic #69 (0.012): 0.023*"boy" + 0.018*"scored" + 0.016*"root" + 0.016*"ambiguity" + 0.016*"noun" + 0.016*"dog" + 0.014*"machines" + 0.011*"tagging" + 0.011*"chasing" + 0.010*"responsible"
2021-11-25 19:02:07,413 : INFO : topic 

2021-11-25 19:02:08,288 : INFO : topic #168 (0.005): 0.000*"harvard" + 0.000*"truth" + 0.000*"dog" + 0.000*"actionable_knowledge" + 0.000*"pet" + 0.000*"non_text" + 0.000*"errors" + 0.000*"sentiment" + 0.000*"math" + 0.000*"lie"
2021-11-25 19:02:08,289 : INFO : topic #42 (0.012): 0.024*"groups" + 0.014*"applause" + 0.011*"light" + 0.010*"coin" + 0.009*"leads" + 0.008*"tail" + 0.008*"painting" + 0.008*"closest" + 0.008*"present" + 0.008*"representative"
2021-11-25 19:02:08,290 : INFO : topic #69 (0.012): 0.023*"boy" + 0.018*"scored" + 0.016*"noun" + 0.016*"ambiguity" + 0.016*"dog" + 0.015*"root" + 0.015*"machines" + 0.012*"tagging" + 0.011*"chasing" + 0.011*"responsible"
2021-11-25 19:02:08,290 : INFO : topic #9 (0.015): 0.131*"interface" + 0.050*"interfaces" + 0.041*"comparable" + 0.031*"implements" + 0.021*"ad" + 0.012*"abstract" + 0.012*"compiler" + 0.011*"instances" + 0.010*"adder" + 0.008*"documentation"
2021-11-25 19:02:08,293 : INFO : topic diff=0.549148, rho=0.242536
2021-11-25 

2021-11-25 19:02:09,164 : INFO : topic #42 (0.012): 0.024*"groups" + 0.014*"applause" + 0.011*"light" + 0.010*"coin" + 0.009*"leads" + 0.008*"tail" + 0.008*"painting" + 0.008*"closest" + 0.008*"representative" + 0.008*"present"
2021-11-25 19:02:09,164 : INFO : topic #69 (0.013): 0.024*"boy" + 0.018*"scored" + 0.016*"noun" + 0.016*"dog" + 0.016*"ambiguity" + 0.015*"root" + 0.015*"machines" + 0.012*"tagging" + 0.011*"chasing" + 0.011*"responsible"
2021-11-25 19:02:09,165 : INFO : topic #9 (0.016): 0.132*"interface" + 0.050*"interfaces" + 0.041*"comparable" + 0.032*"implements" + 0.021*"ad" + 0.012*"abstract" + 0.012*"compiler" + 0.011*"instances" + 0.010*"adder" + 0.008*"documentation"
2021-11-25 19:02:09,168 : INFO : topic diff=0.336063, rho=0.229416
2021-11-25 19:02:09,225 : DEBUG : bound: at document #0
2021-11-25 19:02:09,409 : INFO : -9.076 per-word bound, 539.8 perplexity estimate based on a held-out corpus of 160 documents with 51198 words
2021-11-25 19:02:09,409 : INFO : PROGRESS

2021-11-25 19:02:10,035 : INFO : topic #69 (0.013): 0.024*"boy" + 0.018*"scored" + 0.016*"dog" + 0.016*"noun" + 0.016*"ambiguity" + 0.015*"root" + 0.015*"machines" + 0.012*"tagging" + 0.011*"chasing" + 0.011*"john"
2021-11-25 19:02:10,036 : INFO : topic #9 (0.016): 0.132*"interface" + 0.050*"interfaces" + 0.041*"comparable" + 0.032*"implements" + 0.021*"ad" + 0.012*"abstract" + 0.012*"compiler" + 0.011*"instances" + 0.010*"adder" + 0.008*"documentation"
2021-11-25 19:02:10,039 : INFO : topic diff=0.210966, rho=0.218218
2021-11-25 19:02:10,067 : DEBUG : starting a new internal lifecycle event log for LdaModel
2021-11-25 19:02:10,068 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel<num_terms=5422, num_topics=172, decay=0.5, chunksize=160> in 9.07s', 'datetime': '2021-11-25T19:02:10.067931', 'gensim': '4.1.3.dev0', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [234]:
print(len(phrased_doc))
i = 12
i-=1
top_topics = model.get_document_topics(corpus[i]) # [(topic_id, prob)]
print(phrased_doc[i])
####
top_topics.sort(key=takelast, reverse=True)
for topic in top_topics:
    print('topic {}'.format(i))
    i+=1
    topic_term_distribution = model.get_topic_terms(topic[0])
    for term in topic_term_distribution:
        print('{}, {}'.format(dictionary[term[0]], term[1]))
    print('prob: {}'.format(topic[-1]))

160
['lecture', 'fast', 'search', 'inverted', 'lecture', 'going', 'continue', 'discussion', 'system', 'particular', 'going', 'talk', 'support', 'fast', 'search', 'inverted', 'lets', 'think', 'general', 'scoring_function', 'look', 'course', 'vector_space', 'model', 'special', 'case', 'imagine', 'retrieval', 'functions', 'form', 'function', 'scoring_function', 'document', 'd', 'query', 'q', 'defined', 'function', 'thats', 'adjustment', 'function', 'consider', 'factors', 'shown', 'end', 'f', 'sub', 'd', 'd', 'f', 'sub', 'q', 'adjustment', 'factors', 'document', 'query', 'level', 'document', 'inside', 'function', 'theres', 'function', 'called', 'main', 'scoring_function', 'said', 'scoring', 'factors', 'level', 'document', 'example', 'document', 'aggregate', 'functioning', 'inside', 'edge', 'function', 'functions', 'compute', 'weights', 'contribution', 'matched', 'query', 'term', 'function', 'g', 'gives', 'weight', 'match', 'query', 'term', 'ti', 'document', 'h', 'function', 'aggregate', 'w

In [146]:
def takelast(x):
    return x[-1]

In [143]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("model2.lda")
model.save(temp_file)

2021-11-25 14:30:32,379 : DEBUG : starting a new internal lifecycle event log for LdaState
2021-11-25 14:30:32,380 : INFO : LdaState lifecycle event {'fname_or_handle': 'D:\\ProgramFiles\\conda\\lib\\site-packages\\gensim-4.1.3.dev0-py3.8-win-amd64.egg\\gensim\\test\\test_data\\model2.lda.state', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-25T14:30:32.379177', 'gensim': '4.1.3.dev0', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
2021-11-25 14:30:32,380 : DEBUG : {'uri': 'D:\\ProgramFiles\\conda\\lib\\site-packages\\gensim-4.1.3.dev0-py3.8-win-amd64.egg\\gensim\\test\\test_data\\model2.lda.state', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2021-11-25 14:30:32,385 : INFO : saved D:\ProgramFiles\conda\lib\site-packages\gensim-4.1.3.dev