Follow the example from https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05


In [None]:
from gensim.corpora import Dictionary as D
from gensim.corpora import MmCorpus
from gensim.models.ldamodel import LdaModel

# Placeholder text standing in for a new, unseen document.
document = "This is some document..."

# load id->word mapping (the dictionary)
id2word = D.load_from_text('wiki_en_wordids.txt')

# load corpus iterator
mm = MmCorpus('wiki_en_tfidf.mm')

# extract 100 LDA topics, updating the model once every 10,000 documents
lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)

# use LDA model: transform new doc to bag-of-words, then apply lda.
# doc2bow is an instance method, so it has to be called on the loaded
# dictionary (id2word), not on the Dictionary class itself.
doc_bow = id2word.doc2bow(document.split())
doc_lda = lda[doc_bow]

# doc_lda is a sparse list of (topic_id, probability) pairs describing the
# weighted presence of topics in the doc; per the gensim docs, topics below
# the model's minimum_probability are omitted, so it can be shorter than
# num_topics.

In [None]:
# Import gensim.corpora via its dotted name. Per the Python docs, __import__
# returns the top-level package for a dotted name when no fromlist is given,
# so as the last expression of a notebook cell this presumably echoes the
# `gensim` package rather than the `corpora` submodule.
__import__("gensim.corpora")

https://radimrehurek.com/gensim/tutorial.html

In [None]:
from gensim import corpora, models, similarities

In [None]:
from gensim import corpora, models, similarities

# Toy corpus of nine documents. Each document is a sparse vector written as
# (feature_id, weight) pairs; the feature ids used here span 0..11.
corpus = [
    [(0, 1.0), (1, 1.0), (2, 1.0)],
    [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
    [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
    [(0, 1.0), (4, 2.0), (7, 1.0)],
    [(3, 1.0), (5, 1.0), (6, 1.0)],
    [(9, 1.0)],
    [(9, 1.0), (10, 1.0)],
    [(9, 1.0), (10, 1.0), (11, 1.0)],
    [(8, 1.0), (10, 1.0), (11, 1.0)],
]

In [None]:
# Bare expression: when run as a notebook cell, its value is shown as the
# cell's output.
corpus

In [None]:
# Initialize a TF-IDF transformation from the corpus.
tfidf = models.TfidfModel(corpus)

In [None]:
# A query document as (feature_id, count) pairs, run through the TF-IDF
# transformation.
vec = [(0, 1), (4, 1)]
print(tfidf[vec])

In [None]:
# Index the TF-IDF-transformed corpus for similarity queries.
# num_features=12 covers the feature ids 0..11 that appear in the corpus.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)

In [None]:
# Query the index with the TF-IDF version of `vec` and print
# (document_number, similarity) pairs.
sims = index[tfidf[vec]]
print(list(enumerate(sims)))

In [None]:
# Transform two documents in one call. The original indexed with `a, b`
# (a tuple), which gensim treats as a corpus and answers with a lazy
# TransformedCorpus wrapper, so print() only showed the wrapper's repr.
# Pass an explicit list of documents and materialize the result so the
# actual TF-IDF vectors are printed.
two_docs = [[(0, 2), (1, 2)], [(0, 2), (3, 2)]]
print(list(tfidf[two_docs]))

## Tutorial 1

https://radimrehurek.com/gensim/tut1.html

In [None]:
# The nine example documents (short titles) used throughout the tutorials.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]


In [None]:
# defaultdict is no longer used in this cell; the import is kept only in case
# other cells rely on the name being in scope.
from collections import Counter, defaultdict
from pprint import pprint  # pretty-printer

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once across the whole collection
frequency = Counter(token for text in texts for token in text)
texts = [[token for token in text if frequency[token] > 1] for text in texts]

pprint(texts)

Create the dictionary.

In [None]:
# Build the token -> integer id mapping from the tokenized texts and save it
# to disk (the later tutorial cells load 'deerwester.dict').
dictionary = corpora.Dictionary(texts)
dictionary.save('deerwester.dict')
print(dictionary)

In [None]:
# Convert every tokenized text to its bag-of-words vector, then serialize the
# corpus to disk (the later tutorial cells load 'deerwester.mm').
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('deerwester.mm', corpus)
print(corpus)

This is a quick insight into dealing with a corpus.

We need to convert each of our documents into a single line of text so that the corpus can be streamed and iterated over easily. The text could come from pdftotext or from the Python web crawler.

A question is how to build a dictionary that can grow over time: if I add new documents containing previously unseen words, how do they compare to earlier documents?

## Tutorial 2: transformers

https://radimrehurek.com/gensim/tut2.html

In [None]:
import os

In [None]:
import os

from gensim import corpora, models, similarities

# Both files are written by the first tutorial. Check for each of them so a
# missing corpus file yields the hint below instead of a load error.
if os.path.exists("deerwester.dict") and os.path.exists("deerwester.mm"):
    dictionary = corpora.Dictionary.load('deerwester.dict')
    corpus = corpora.MmCorpus('deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

In [None]:
# Initialize a TF-IDF transformation from the loaded corpus.
tfidf = models.TfidfModel(corpus)

In [None]:
# Wrap the whole corpus in the TF-IDF transformation and print each
# transformed document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

In [None]:
# initialize an LSI transformation with two topics
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsi = lsi[corpus_tfidf]

In [None]:
# Show the two LSI topics; as the last expression of the cell, the returned
# value is displayed as the cell's output.
lsi.print_topics(2)

In [None]:
# both bow->tfidf and tfidf->lsi transformations are actually executed here,
# on the fly
for doc in corpus_lsi:
    print(doc)

## Tutorial 3: similarity queries

https://radimrehurek.com/gensim/tut3.html

In [None]:
import os

from gensim import corpora, models, similarities

# Both files are written by the first tutorial. Check for each of them so a
# missing corpus file yields the hint below instead of a load error.
if os.path.exists("deerwester.dict") and os.path.exists("deerwester.mm"):
    dictionary = corpora.Dictionary.load('deerwester.dict')
    corpus = corpora.MmCorpus('deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

In [None]:
# Build a two-topic LSI model directly from the bag-of-words corpus.
lsi = models.LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=2,
)

In [None]:
# Tokenize the query the same simple way (lowercase + whitespace split),
# map it to bag-of-words ids, then project it with the LSI model.
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)

In [None]:
# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])

Persist index

In [None]:
# Save the similarity index to disk, then load it back from the same file.
index.save('deerwester.index')
index = similarities.MatrixSimilarity.load('deerwester.index')

In [None]:
# perform a similarity query against the corpus
sims = index[vec_lsi]
# print (document_number, document_similarity) 2-tuples
print(list(enumerate(sims)))

In [None]:
# Pair each similarity with its document number and order by descending
# similarity (the key negates the score).
sims = sorted(enumerate(sims), key=lambda item: -item[1])
# print sorted (document number, similarity score) 2-tuples
print(sims)