In [64]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's English NLP model (en_core_web_sm).
# Prerequisite: download the model once from a shell before running this cell:
#   python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

### Sample documents

In [65]:
# Toy corpus of seven short English texts used as input for the topic models.
# The first four are about NLP / topic modeling, the fifth is about a video
# game, and the last two are motivational sentences.
documents  = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Latent Dirichlet Allocation is a generative probabilistic model.",
    "Topic modeling is used to identify topics present in a corpus of text.",
    "Gensim is a popular Python library for topic modeling and document similarity.",
    "Dota 2 is a popular MOBA PC game available on Steam. It is known for being difficult and non-beginner friendly gamplay mechanics.",
    "Do something now that your future you will be thankful for.",
    "Nothing will ever change. Everything will stay the same. You will always be on the same spot unless you take the first step."
]

### Preprocess the documents

In [66]:
def preprocess(text):
    """Tokenize ``text`` with spaCy and return the lemmas of the kept tokens.

    A token is dropped when spaCy flags it as a stop word (``is_stop``) or
    when it is not purely alphabetic (``is_alpha`` is false).

    Args:
        text: The document text to process.

    Returns:
        A list with the ``lemma_`` of every kept token, in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Skip stop words and anything that is not purely alphabetic
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Fixed seed so the LDA fits (and therefore the topics printed below) are
# reproducible across "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Apply preprocessing to all documents.
# Iterate over `documents` (the sample list defined in this notebook);
# `documents2` is not defined in any visible cell and fails on a fresh kernel.
processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary (token -> integer id) and a bag-of-words corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build LDA models with 3, 6 and 9 topics on the same corpus so the effect of
# the topic count can be compared in the cells below.
lda_model_3 = gensim.models.LdaModel(
    corpus, num_topics=3, id2word=dictionary, passes=15, random_state=RANDOM_STATE
)

lda_model_6 = gensim.models.LdaModel(
    corpus, num_topics=6, id2word=dictionary, passes=15, random_state=RANDOM_STATE
)

lda_model_9 = gensim.models.LdaModel(
    corpus, num_topics=9, id2word=dictionary, passes=15, random_state=RANDOM_STATE
)

### Print topics and their keywords

In [67]:
# Show the weighted keywords of every topic in the 3-topic model
print('LDA Model 3 topics')
topic_keywords_3 = lda_model_3.print_topics()
pprint(topic_keywords_3)

LDA Model 3 topics
[(0,
  '0.081*"topic" + 0.056*"modeling" + 0.032*"similarity" + 0.032*"document" + '
  '0.032*"Gensim" + 0.032*"library" + 0.032*"Python" + 0.032*"present" + '
  '0.032*"text" + 0.032*"identify"'),
 (1,
  '0.023*"future" + 0.023*"thankful" + 0.023*"change" + 0.023*"spot" + '
  '0.023*"stay" + 0.023*"step" + 0.023*"popular" + 0.023*"Dirichlet" + '
  '0.023*"probabilistic" + 0.023*"generative"'),
 (2,
  '0.039*"popular" + 0.039*"MOBA" + 0.039*"mechanic" + 0.039*"available" + '
  '0.039*"friendly" + 0.039*"pc" + 0.039*"Dota" + 0.039*"Steam" + '
  '0.039*"gamplay" + 0.039*"game"')]


In [68]:
# Show the weighted keywords of every topic in the 6-topic model
print('LDA Model 6 topics')
topic_keywords_6 = lda_model_6.print_topics()
pprint(topic_keywords_6)

LDA Model 6 topics
[(0,
  '0.127*"thankful" + 0.127*"future" + 0.018*"spot" + 0.018*"stay" + '
  '0.018*"change" + 0.018*"step" + 0.018*"topic" + 0.018*"modeling" + '
  '0.018*"popular" + 0.018*"artificial"'),
 (1,
  '0.153*"topic" + 0.082*"modeling" + 0.082*"text" + 0.082*"corpus" + '
  '0.082*"identify" + 0.082*"present" + 0.012*"future" + 0.012*"spot" + '
  '0.012*"thankful" + 0.012*"stay"'),
 (2,
  '0.055*"popular" + 0.055*"non" + 0.055*"mechanic" + 0.055*"beginner" + '
  '0.055*"Dota" + 0.055*"pc" + 0.055*"Steam" + 0.055*"difficult" + '
  '0.055*"game" + 0.055*"gamplay"'),
 (3,
  '0.089*"probabilistic" + 0.089*"Latent" + 0.089*"Dirichlet" + '
  '0.089*"Allocation" + 0.089*"generative" + 0.089*"model" + 0.013*"thankful" '
  '+ 0.013*"future" + 0.013*"topic" + 0.013*"stay"'),
 (4,
  '0.089*"natural" + 0.089*"intelligence" + 0.089*"subfield" + '
  '0.089*"language" + 0.089*"processing" + 0.089*"artificial" + 0.013*"future" '
  '+ 0.013*"thankful" + 0.013*"topic" + 0.013*"spot"'),
 (5

In [69]:
# Show the weighted keywords of every topic in the 9-topic model
print('LDA Model 9 topics')
topic_keywords_9 = lda_model_9.print_topics()
pprint(topic_keywords_9)

LDA Model 9 topics
[(0,
  '0.119*"topic" + 0.063*"Dirichlet" + 0.063*"probabilistic" + 0.063*"model" + '
  '0.063*"Latent" + 0.063*"generative" + 0.063*"Allocation" + 0.062*"corpus" + '
  '0.062*"present" + 0.062*"identify"'),
 (1,
  '0.023*"future" + 0.023*"thankful" + 0.023*"modeling" + 0.023*"topic" + '
  '0.023*"step" + 0.023*"stay" + 0.023*"popular" + 0.023*"present" + '
  '0.023*"spot" + 0.023*"change"'),
 (2,
  '0.103*"language" + 0.103*"artificial" + 0.103*"subfield" + '
  '0.103*"intelligence" + 0.103*"processing" + 0.103*"natural" + '
  '0.010*"future" + 0.010*"thankful" + 0.010*"topic" + 0.010*"change"'),
 (3,
  '0.059*"gamplay" + 0.059*"Steam" + 0.059*"beginner" + 0.059*"pc" + '
  '0.059*"non" + 0.059*"difficult" + 0.059*"game" + 0.059*"MOBA" + '
  '0.059*"available" + 0.059*"friendly"'),
 (4,
  '0.023*"future" + 0.023*"thankful" + 0.023*"stay" + 0.023*"topic" + '
  '0.023*"modeling" + 0.023*"step" + 0.023*"change" + 0.023*"probabilistic" + '
  '0.023*"popular" + 0.023*"art

### Assign topics to documents

In [70]:
# Print the topic distribution of each document under the 3-topic model.
# Iterate over the bag-of-words corpus directly (numbering from 1) instead of
# enumerating the token lists and indexing `corpus` by position.
for doc_number, bow in enumerate(corpus, start=1):
    doc_topics = lda_model_3.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {doc_topics}")

Document 1 - Topic: [(0, 0.047890186), (1, 0.04845486), (2, 0.903655)]
Document 2 - Topic: [(0, 0.9033362), (1, 0.04864654), (2, 0.04801727)]
Document 3 - Topic: [(0, 0.91578436), (1, 0.042299073), (2, 0.04191659)]
Document 4 - Topic: [(0, 0.9242065), (1, 0.03769536), (2, 0.038098145)]
Document 5 - Topic: [(0, 0.022521215), (1, 0.022605065), (2, 0.9548737)]
Document 6 - Topic: [(0, 0.7741827), (1, 0.11371442), (2, 0.11210286)]
Document 7 - Topic: [(0, 0.8646352), (1, 0.068131074), (2, 0.06723372)]


In [71]:
# Print the topic distribution of each document under the 6-topic model.
# Iterate over the bag-of-words corpus directly (numbering from 1) instead of
# enumerating the token lists and indexing `corpus` by position.
for doc_number, bow in enumerate(corpus, start=1):
    doc_topics = lda_model_6.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {doc_topics}")

Document 1 - Topic: [(0, 0.023810547), (1, 0.023810193), (2, 0.023809984), (3, 0.023810241), (4, 0.880949), (5, 0.023810025)]
Document 2 - Topic: [(0, 0.023810547), (1, 0.023810193), (2, 0.023809984), (3, 0.880949), (4, 0.023810241), (5, 0.023810025)]
Document 3 - Topic: [(0, 0.020834135), (1, 0.895785), (2, 0.020833693), (3, 0.020833895), (4, 0.020833895), (5, 0.02087945)]
Document 4 - Topic: [(0, 0.018519677), (1, 0.018652756), (2, 0.01854244), (3, 0.018519329), (4, 0.018519329), (5, 0.90724653)]
Document 5 - Topic: [(0, 0.011111867), (1, 0.011111605), (2, 0.944432), (3, 0.01111164), (4, 0.01111164), (5, 0.011121212)]
Document 6 - Topic: [(0, 0.7222168), (1, 0.05555674), (2, 0.055556368), (3, 0.05555683), (4, 0.05555683), (5, 0.055556446)]
Document 7 - Topic: [(0, 0.033335503), (1, 0.033334747), (2, 0.033334304), (3, 0.033334848), (4, 0.033334848), (5, 0.83332574)]


In [72]:
# Print the topic distribution of each document under the 9-topic model.
# Iterate over the bag-of-words corpus directly (numbering from 1) instead of
# enumerating the token lists and indexing `corpus` by position.
for doc_number, bow in enumerate(corpus, start=1):
    doc_topics = lda_model_9.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {doc_topics}")

Document 1 - Topic: [(0, 0.015873047), (1, 0.01587305), (2, 0.87301564), (3, 0.015873047), (4, 0.01587305), (5, 0.015873048), (6, 0.015873048), (7, 0.01587305), (8, 0.015873047)]
Document 2 - Topic: [(0, 0.87301546), (1, 0.015873069), (2, 0.015873065), (3, 0.015873063), (4, 0.015873069), (5, 0.015873067), (6, 0.015873065), (7, 0.015873069), (8, 0.015873065)]
Document 3 - Topic: [(0, 0.88888454), (1, 0.013888928), (2, 0.013888925), (3, 0.013888924), (4, 0.013888928), (5, 0.013888926), (6, 0.013888925), (7, 0.013888928), (8, 0.0138929635)]
Document 4 - Topic: [(0, 0.012348668), (1, 0.01234571), (2, 0.012345708), (3, 0.012346512), (4, 0.01234571), (5, 0.012345709), (6, 0.012345708), (7, 0.01234571), (8, 0.9012306)]
Document 5 - Topic: [(3, 0.9407399)]
Document 6 - Topic: [(0, 0.037037082), (1, 0.037037086), (2, 0.037037082), (3, 0.037037082), (4, 0.037037086), (5, 0.7037033), (6, 0.037037082), (7, 0.037037086), (8, 0.037037082)]
Document 7 - Topic: [(0, 0.022222258), (1, 0.02222226), (2, 

### Discussion

As more topics are added, the weight of each individual topic becomes smaller. In some cases, however, a single topic still holds a significantly larger share than the rest, regardless of the number of topics. It is therefore essential to assess the quality of the topics to improve the interpretability of the results.