In [2]:
# Import required libraries
import gensim
from gensim import corpora


In [3]:

# Load the corpus
documents = [
    "This is the first document.",
    "This document is the second document.", 
    "And this is the third one.",
    "Is this the first document?"
    ]


In [4]:

# Preprocess the data
text_data = []
for document in documents:
    text_data.append(document.lower().split())
print(text_data)

[['this', 'is', 'the', 'first', 'document.'], ['this', 'document', 'is', 'the', 'second', 'document.'], ['and', 'this', 'is', 'the', 'third', 'one.'], ['is', 'this', 'the', 'first', 'document?']]


In [8]:

# Create a dictionary from the preprocessed data
dictionary = corpora.Dictionary(text_data)

# Create a document-term matrix from the dictionary
corpus = [dictionary.doc2bow(text) for text in text_data]
for corp in corpus:
    print(corp)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
[(0, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
[(2, 1), (3, 1), (4, 1), (7, 1), (8, 1), (9, 1)]
[(1, 1), (2, 1), (3, 1), (4, 1), (10, 1)]


In [11]:

# Train the LDA model
num_topics = 8
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus, 
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
    )


In [12]:

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.172*"is" + 0.172*"the" + 0.172*"this" + 0.091*"document." + 0.091*"first" + 0.091*"document" + 0.091*"second" + 0.091*"document?" + 0.010*"one." + 0.010*"third"
Topic: 1 
Words: 0.091*"is" + 0.091*"this" + 0.091*"the" + 0.091*"first" + 0.091*"document." + 0.091*"one." + 0.091*"second" + 0.091*"third" + 0.091*"document?" + 0.091*"document"
Topic: 2 
Words: 0.091*"is" + 0.091*"this" + 0.091*"the" + 0.091*"document." + 0.091*"first" + 0.091*"document?" + 0.091*"document" + 0.091*"one." + 0.091*"third" + 0.091*"second"
Topic: 3 
Words: 0.153*"and" + 0.153*"third" + 0.153*"one." + 0.152*"this" + 0.152*"the" + 0.152*"is" + 0.017*"first" + 0.017*"document." + 0.017*"second" + 0.017*"document?"
Topic: 4 
Words: 0.091*"is" + 0.091*"the" + 0.091*"this" + 0.091*"first" + 0.091*"document." + 0.091*"second" + 0.091*"one." + 0.091*"document?" + 0.091*"third" + 0.091*"document"
Topic: 5 
Words: 0.177*"first" + 0.177*"document." + 0.176*"the" + 0.176*"this" + 0.176*"is" + 0.020*"doc

: 